<p> <code><br /> <%@LANGUAGE="VBSCRIPT" CODEPAGE="936"%><br /><!-- #include file="conn.asp" --><br /><!-- #include file="inc/function.asp" --><br /><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"><br /><html><br /><head><br /><title>Untitled Document</title><br /><meta http-equiv="Content-Type" content="text/html; charset=gb2312"><br /><meta http-equiv="refresh" content="300;URL=steal_house.asp"><br /></head><br /><body><br /><%<br />on error resume next<br />'<br />Server.ScriptTimeout = 999999<br />'========================================================<br />'字符编码函数<br />'====================================================<br />Function BytesToBstr(body,code)<br /> dim objstream<br /> set objstream = Server.CreateObject("adodb.stream")<br /> objstream.Type = 1<br /> objstream.Mode =3<br /> objstream.Open<br /> objstream.Write body<br /> objstream.Position = 0<br /> objstream.Type = 2<br /> objstream.Charset =code<br /> BytesToBstr = objstream.ReadText <br /> objstream.Close<br /> set objstream = nothing<br />End Function<br />'取行字符串在另一字符串中的出现位置<br />Function Newstring(wstr,strng)<br /> Newstring=Instr(lcase(wstr),lcase(strng))<br /> if Newstring<=0 then Newstring=Len(wstr)<br />End Function<br />'替换字符串函数<br />function ReplaceStr(ori,str1,str2)<br />ReplaceStr=replace(ori,str1,str2)<br />end function<br />'====================================================<br />function ReadXml(url,code,start,ends)<br />set oSend=createobject("Microsoft.XMLHTTP")<br />SourceCode = oSend.open ("GET",url,false)<br />oSend.send()<br />ReadXml=BytesToBstr(oSend.responseBody,code )<br />start=Instr(ReadXml,start)<br />ReadXml=mid(ReadXml,start)<br />ends=Instr(ReadXml,ends)<br />ReadXml=left(ReadXml,ends-1)<br />end function<br />function SubStr(body,start,ends)<br />start=Instr(body,start)<br />SubStr=mid(body,start+len(start)+1)<br />ends=Instr(SubStr,ends)<br />SubStr=left(SubStr,ends-1)<br />end function<br />dim getcont,NewsContent<br />dim url,title<br />url="http://www.***.com"'新闻网址<br />getcont=ReadXml(url,"gb2312","<table class=k2 border=""0""","</table>")<br />getcont=RegexHtml(getcont)<br />dim KeyId,NewsClass,City,Position,HouseType,Level,Area,Price,Demostra<br />dim ContactMan,Contact<br />for i=2 to ubound(getcont)<br /> response.Write(getcont(i)&"__<br>")<br /> <br /> tempLink=mid(getcont(i),instr(getcont(i),"href="" mce_href=""")+6,instr(getcont(i),""" onClick")-10)<br /> tempLink=replace(tempLink,"../","")<br /> response.Write(i&":"&tempLink&"<br>")<br /> NewsContent=ReadXml(tempLink,"gb2312","<td valign=""bottom"" width=""400"">","<hr width=""760"" noshade size=""1"" color=""#808080""> ")<br /> NewsContent=RemoveHtml(NewsContent)<br /> NewsContent=replace(NewsContent,VbCrLf,"")<br /> NewsContent=replace(NewsContent,vbNewLine,"")<br /> NewsContent=replace(NewsContent," ","")<br /> NewsContent=replace(NewsContent," ","")<br /> NewsContent=replace(NewsContent," ","") <br /> NewsContent=replace(NewsContent,"\n","") <br /> NewsContent=replace(NewsContent,chr(10),"")<br /> NewsContent=replace(NewsContent,chr(13),"")<br /> '===============get Content=======================<br /> response.Write(NewsContent)<br /> KeyId=SubStr(NewsContent,"列号:","信息类别:")<br /> NewsClass=SubStr(NewsContent,"类别:","所在城市:")<br /> City=SubStr(NewsContent,"城市:","房屋具体位置:")<br /> Position=SubStr(NewsContent,"位置:","房屋类型:")<br /> HouseType=SubStr(NewsContent,"类型:","楼层:")<br /> Level=SubStr(NewsContent,"楼层:","使用面积:")<br /> Area=SubStr(NewsContent,"面积:","房价:")<br /> Price=SubStr(NewsContent,"房价:","其他说明:")<br /> Demostra=SubStr(NewsContent,"说明:","联系人:")<br /> ContactMan=SubStr(NewsContent,"联系人:","联系方式:")<br /> Contact=SubStr(NewsContent,"联系方式:","信息来源:") <br /> response.Write("总序列号:"&KeyId&"<br>")<br /> response.Write("信息类别:"&NewsClass&"<br>")<br /> response.Write("所在城市:"&City&"<br>")<br /> response.Write("房屋具体位置:"&Position&"<br>")<br /> response.Write("房屋类型:"&HouseType&"<br>")<br /> response.Write("楼层:"&Level&"<br>")<br /> response.Write("使用面积:"&Area&"<br>")<br /> response.Write("房价:"&Price&"<br>")<br /> response.Write("其他说明:"&Demostra&"<br>")<br /> response.Write("联系人:"&ContactMan&"<br>")<br /> response.Write("联系方式:"&Contact&"<br>")<br /> 'title=RemoveHTML(aa(i))<br /> 'response.Write("title:"&title)<br /> for n=0 to application.Contents.count<br />  if(application.Contents(n)=KeyId) then<br /> ifexit=true  <br />  end if  <br /> next <br /> if not ifexit then<br />  application(time&i)=KeyId<br /> '添加到数据库<br /> '====================================================<br /> set rs=server.CreateObject("adodb.recordset")<br /> rs.open "select top 1 * from news order by id desc",conn,3,3<br /> rs.addnew<br /> rs("NewsClass")=NewsClass<br /> rs("City")=City<br /> rs("Position")=Position<br /> rs("HouseType")=HouseType<br /> rs("Level")=Level<br /> rs("Area")=Area<br /> rs("Price")=Price<br /> rs("Demostra")=Demostra<br /> rs("ContactMan")=ContactMan<br /> rs("Contact")=Contact<br /> rs.update<br /> rs.close<br /> set rs=nothing<br /> end if<br /> '==================================================<br /> <br />next<br />function RemoveTag(body)<br /> Set regEx = New RegExp<br /> regEx.Pattern = "<[a].*?<\/[a]>"<br /> regEx.IgnoreCase = True<br /> regEx.Global = True<br /> Set Matches = regEx.Execute(body) <br /> dim i,arr(15),ifexit<br /> i=0<br /> j=0<br /> For Each Match in Matches<br /> TempStr = Match.Value <br /> TempStr=replace(TempStr,"<td>","")<br /> TempStr=replace(TempStr,"</td>","")<br /> TempStr=replace(TempStr,"<tr>","")<br /> TempStr=replace(TempStr,"</tr>","") <br /> arr(i)=TempStr <br /> i=i+1<br /> if(i>=15) then<br />  exit for<br /> end if<br /> Next<br /> Set regEx=nothing<br /> Set Matches =nothing<br /> RemoveTag=arr<br /> <br />end function<br />function RegexHtml(body)<br /> dim r_arr(47),r_temp<br /> Set regEx2 = New RegExp<br /> regEx2.Pattern ="<a.*?<\/a>"<br /> regEx2.IgnoreCase = True<br /> regEx2.Global = True<br /> Set Matches2 = regEx2.Execute(body) <br /> iii=0 <br /> For Each Match in Matches2<br /> r_arr(iii)=Match.Value<br /> iii=iii+1 <br /> Next<br /> RegexHtml=r_arr<br /> set regEx2=nothing<br /> set Matches2=nothing<br />end function<br />'======================================================<br />conn.close<br />set conn=nothing<br />%><br /></body><br /></html><br /></code></p><p> function.asp</p><code><br /> <%<br />'**************************************************<br />'函数名:gotTopic<br />'作 用:截字符串,汉字一个算两个字符,英文算一个字符<br />'参 数:str ----原字符串<br />' strlen ----截取长度<br />'返回值:截取后的字符串<br />'**************************************************<br />function gotTopic(str,strlen)<br /> if str="" then<br /> gotTopic=""<br /> exit function<br /> end if<br /> dim l,t,c, i<br /> str=replace(replace(replace(replace(str," "," "),""",chr(34)),">",">"),"<","<")<br /> str=replace(str,"?","")<br /> l=len(str)<br /> t=0<br /> for i=1 to l<br /> c=Abs(Asc(Mid(str,i,1)))<br /> if c>255 then<br />  t=t+2<br /> else<br />  t=t+1<br /> end if<br /> if t>=strlen then<br />  gotTopic=left(str,i) & "…"<br />  exit for<br /> else<br />  gotTopic=str<br /> end if<br /> next<br /> gotTopic=replace(replace(replace(replace(gotTopic," "," "),chr(34),"""),">",">"),"<","<")<br />end function<br />'=========================================================<br />'函数:RemoveHTML(strHTML)<br />'功能:去除HTML标记<br />'参数:strHTML --要去除HTML标记的字符串<br />'=========================================================<br />Function RemoveHTML(strHTML)<br />Dim objRegExp, Match, Matches<br />Set objRegExp = New Regexp<br />objRegExp.IgnoreCase = True<br />objRegExp.Global = True<br />'取闭合的<><br />objRegExp.Pattern = "<.+?>"<br />'进行匹配<br />Set Matches = objRegExp.Execute(strHTML)<br />' 遍历匹配集合,并替换掉匹配的项目<br />For Each Match in Matches<br />strHtml=Replace(strHTML,Match.Value,"")<br />Next<br />RemoveHTML=strHTML<br />Set objRegExp = Nothing<br />set Matches=nothing<br />End Function<br />%><br /></code><p> conn.asp</p><code><br /> <%<br />'on error resume next<br />set conn=server.CreateObject("adodb.connection")<br />con= "driver={Microsoft Access Driver (*.mdb)};dbq=" & Server.MapPath("stest.mdb")<br />conn.open con<br />sub connclose <br /> conn.close<br /> set conn=nothing <br />end sub<br />%><br /></code><p> 附:抓取信息的详细页面事例</p><table cellspacing="1" cellpadding="0"><tr><td><p> 总序列号:</p></td><td><p> 479280 </p></td></tr><tr><td><p> 信息类别:</p></td><td><p> 出租</p></td></tr><tr><td><p> 所在城市:</p></td><td><p> 济南</p></td></tr><tr><td><p> 房屋具体位置:</p></td><td><p> 华龙路华信路交界口</p></td></tr><tr><td><p> 房屋类型:</p></td><td><p> 其他</p></td></tr><tr><td><p> 楼层:</p></td><td><p> 六层</p></td></tr><tr><td><p> 使用面积:</p></td><td><p> 24~240 平方米之间</p></td></tr><tr><td><p> 房价:</p></td><td><p> 0 [租赁:元/月,买卖:万元/套]</p></td></tr><tr><td><p> 其他说明:</p></td><td><p> 华信商务楼3至6层小空间对外出租(0.5元/平起),本楼属纯商务办公投资使用,可用于办公写字间,周边设施齐全、交通便利(37、80、K95在本楼前经过),全产权、市证,楼内设施包括水、电、暖、电梯设施齐全,有意者可电讯!</p></td></tr><tr><td><p> 联系人:</p></td><td><p> 鲁、王</p></td></tr><tr><td><p> 联系方式:</p></td><td><p> 88017966、86812217 </p></td></tr><tr><td><p> 信息来源:</p></td><td><p> 2005-8-4 8:28:55 来自:218.98.86.175</p></td></tr><tr><td><p> 点击次数:</p></td><td><p> 19</p></td></tr></table>