«
实例讲解ASP实现抓取网上房产信息

时间:2008-5-31    作者:Deri    分类: 分享


   <p>  <code><br />&#160;<%@LANGUAGE="VBSCRIPT" CODEPAGE="936"%><br /><!-- #include file="conn.asp" --><br /><!-- #include file="inc/function.asp" --><br /><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"><br /><html><br /><head><br /><title>Untitled Document</title><br /><meta http-equiv="Content-Type" content="text/html; charset=gb2312"><br /><meta http-equiv="refresh" content="300;URL=steal_house.asp"><br /></head><br /><body><br /><%<br />on error resume next<br />'<br />Server.ScriptTimeout = 999999<br />'========================================================<br />'字符编码函数<br />'====================================================<br />Function BytesToBstr(body,code)<br />    dim objstream<br />    set objstream = Server.CreateObject("adodb.stream")<br />    objstream.Type = 1<br />    objstream.Mode =3<br />    objstream.Open<br />    objstream.Write body<br />    objstream.Position = 0<br />    objstream.Type = 2<br />    objstream.Charset =code<br />    BytesToBstr = objstream.ReadText&#160;<br />    objstream.Close<br />    set objstream = nothing<br />End Function<br />'取行字符串在另一字符串中的出现位置<br />Function Newstring(wstr,strng)<br />    Newstring=Instr(lcase(wstr),lcase(strng))<br />    if Newstring<=0 then Newstring=Len(wstr)<br />End Function<br />'替换字符串函数<br />function ReplaceStr(ori,str1,str2)<br />ReplaceStr=replace(ori,str1,str2)<br />end function<br />'====================================================<br />function ReadXml(url,code,start,ends)<br />set oSend=createobject("Microsoft.XMLHTTP")<br />SourceCode = oSend.open ("GET",url,false)<br />oSend.send()<br />ReadXml=BytesToBstr(oSend.responseBody,code )<br />start=Instr(ReadXml,start)<br />ReadXml=mid(ReadXml,start)<br />ends=Instr(ReadXml,ends)<br />ReadXml=left(ReadXml,ends-1)<br />end function<br />function SubStr(body,start,ends)<br />start=Instr(body,start)<br />SubStr=mid(body,start+len(start)+1)<br />ends=Instr(SubStr,ends)<br 
/>SubStr=left(SubStr,ends-1)<br />end function<br />dim getcont,NewsContent<br />dim url,title<br />url="http://www.***.com"'新闻网址<br />getcont=ReadXml(url,"gb2312","<table class=k2 border=""0""","</table>")<br />getcont=RegexHtml(getcont)<br />dim KeyId,NewsClass,City,Position,HouseType,Level,Area,Price,Demostra<br />dim ContactMan,Contact<br />for i=2 to ubound(getcont)<br />&#160;response.Write(getcont(i)&"__<br>")<br />&#160;<br />&#160;tempLink=mid(getcont(i),instr(getcont(i),"href="" mce_href=""")+6,instr(getcont(i),""" onClick")-10)<br />&#160;tempLink=replace(tempLink,"../","")<br />&#160;response.Write(i&":"&tempLink&"<br>")<br />&#160;NewsContent=ReadXml(tempLink,"gb2312","<td valign=""bottom"" width=""400"">","<hr width=""760"" noshade size=""1"" color=""#808080""> ")<br />&#160;NewsContent=RemoveHtml(NewsContent)<br />&#160;NewsContent=replace(NewsContent,VbCrLf,"")<br />&#160;NewsContent=replace(NewsContent,vbNewLine,"")<br />&#160;NewsContent=replace(NewsContent,"&#160;","")<br />&#160;NewsContent=replace(NewsContent," ","")<br />&#160;NewsContent=replace(NewsContent,"&nbsp;","")&#160;<br />&#160;NewsContent=replace(NewsContent,"\n","")&#160;<br />&#160;NewsContent=replace(NewsContent,chr(10),"")<br />&#160;NewsContent=replace(NewsContent,chr(13),"")<br />&#160;'===============get Content=======================<br />&#160;response.Write(NewsContent)<br />&#160;KeyId=SubStr(NewsContent,"列号:","信息类别:")<br />&#160;NewsClass=SubStr(NewsContent,"类别:","所在城市:")<br />&#160;City=SubStr(NewsContent,"城市:","房屋具体位置:")<br />&#160;Position=SubStr(NewsContent,"位置:","房屋类型:")<br />&#160;HouseType=SubStr(NewsContent,"类型:","楼层:")<br />&#160;Level=SubStr(NewsContent,"楼层:","使用面积:")<br />&#160;Area=SubStr(NewsContent,"面积:","房价:")<br />&#160;Price=SubStr(NewsContent,"房价:","其他说明:")<br />&#160;Demostra=SubStr(NewsContent,"说明:","联系人:")<br />&#160;ContactMan=SubStr(NewsContent,"联系人:","联系方式:")<br />&#160;Contact=SubStr(NewsContent,"联系方式:","信息来源:")&#160;<br 
/>&#160;response.Write("总序列号:"&KeyId&"<br>")<br />&#160;response.Write("信息类别:"&NewsClass&"<br>")<br />&#160;response.Write("所在城市:"&City&"<br>")<br />&#160;response.Write("房屋具体位置:"&Position&"<br>")<br />&#160;response.Write("房屋类型:"&HouseType&"<br>")<br />&#160;response.Write("楼层:"&Level&"<br>")<br />&#160;response.Write("使用面积:"&Area&"<br>")<br />&#160;response.Write("房价:"&Price&"<br>")<br />&#160;response.Write("其他说明:"&Demostra&"<br>")<br />&#160;response.Write("联系人:"&ContactMan&"<br>")<br />&#160;response.Write("联系方式:"&Contact&"<br>")<br />&#160;'title=RemoveHTML(aa(i))<br />&#160;'response.Write("title:"&title)<br />&#160;for n=0 to application.Contents.count<br /> &#160;if(application.Contents(n)=KeyId) then<br />  ifexit=true  &#160;<br /> &#160;end if &#160;<br />&#160;next <br />&#160;if not ifexit then<br /> &#160;application(time&i)=KeyId<br />&#160;'添加到数据库<br />&#160;'====================================================<br />&#160;set rs=server.CreateObject("adodb.recordset")<br />&#160;rs.open "select top 1 * from news order by id desc",conn,3,3<br />&#160;rs.addnew<br />&#160;rs("NewsClass")=NewsClass<br />&#160;rs("City")=City<br />&#160;rs("Position")=Position<br />&#160;rs("HouseType")=HouseType<br />&#160;rs("Level")=Level<br />&#160;rs("Area")=Area<br />&#160;rs("Price")=Price<br />&#160;rs("Demostra")=Demostra<br />&#160;rs("ContactMan")=ContactMan<br />&#160;rs("Contact")=Contact<br />&#160;rs.update<br />&#160;rs.close<br />&#160;set rs=nothing<br />&#160;end if<br />&#160;'==================================================<br />&#160;<br />next<br />function RemoveTag(body)<br />&#160;Set regEx = New RegExp<br />&#160;regEx.Pattern = "<[a].*?<\/[a]>"<br />&#160;regEx.IgnoreCase = True<br />&#160;regEx.Global = True<br />&#160;Set Matches = regEx.Execute(body)&#160;<br />&#160;dim i,arr(15),ifexit<br />&#160;i=0<br />&#160;j=0<br />&#160;For Each Match in Matches<br /> TempStr = Match.Value <br /> TempStr=replace(TempStr,"<td>","")<br /> 
TempStr=replace(TempStr,"</td>","")<br /> TempStr=replace(TempStr,"<tr>","")<br /> TempStr=replace(TempStr,"</tr>","") <br /> arr(i)=TempStr <br /> i=i+1<br /> if(i>=15) then<br /> &#160;exit for<br /> end if<br />&#160;Next<br />&#160;Set regEx=nothing<br />&#160;Set Matches =nothing<br />&#160;RemoveTag=arr<br />&#160;<br />end function<br />function RegexHtml(body)<br />&#160;dim r_arr(47),r_temp<br />&#160;Set regEx2 = New RegExp<br />&#160;regEx2.Pattern ="<a.*?<\/a>"<br />&#160;regEx2.IgnoreCase = True<br />&#160;regEx2.Global = True<br />&#160;Set Matches2 = regEx2.Execute(body)&#160;<br />&#160;iii=0&#160;<br />&#160;For Each Match in Matches2<br /> r_arr(iii)=Match.Value<br /> iii=iii+1 <br />&#160;Next<br />&#160;RegexHtml=r_arr<br />&#160;set regEx2=nothing<br />&#160;set Matches2=nothing<br />end function<br />'======================================================<br />conn.close<br />set conn=nothing<br />%><br /></body><br /></html><br /></code></p><p>  function.asp</p><code><br />&#160;<%<br />'**************************************************<br />'函数名:gotTopic<br />'作 用:截字符串,汉字一个算两个字符,英文算一个字符<br />'参 数:str  ----原字符串<br />'    strlen ----截取长度<br />'返回值:截取后的字符串<br />'**************************************************<br />function gotTopic(str,strlen)<br />&#160;if str="" then<br /> gotTopic=""<br /> exit function<br />&#160;end if<br />&#160;dim l,t,c, i<br />&#160;str=replace(replace(replace(replace(str,"&nbsp;"," "),"&quot;",chr(34)),"&gt;",">"),"&lt;","<")<br />&#160;str=replace(str,"?","")<br />&#160;l=len(str)<br />&#160;t=0<br />&#160;for i=1 to l<br /> c=Abs(Asc(Mid(str,i,1)))<br /> if c>255 then<br /> &#160;t=t+2<br /> else<br /> &#160;t=t+1<br /> end if<br /> if t>=strlen then<br /> &#160;gotTopic=left(str,i) & "…"<br /> &#160;exit for<br /> else<br /> &#160;gotTopic=str<br /> end if<br />&#160;next<br />&#160;gotTopic=replace(replace(replace(replace(gotTopic," ","&nbsp;"),chr(34),"&quot;"),">","&gt;"),"<","&lt;")<br />end function<br 
/>'=========================================================<br />'函数:RemoveHTML(strHTML)<br />'功能:去除HTML标记<br />'参数:strHTML --要去除HTML标记的字符串<br />'=========================================================<br />Function RemoveHTML(strHTML)<br />Dim objRegExp, Match, Matches<br />Set objRegExp = New Regexp<br />objRegExp.IgnoreCase = True<br />objRegExp.Global = True<br />'取闭合的<><br />objRegExp.Pattern = "<.+?>"<br />'进行匹配<br />Set Matches = objRegExp.Execute(strHTML)<br />' 遍历匹配集合,并替换掉匹配的项目<br />For Each Match in Matches<br />strHtml=Replace(strHTML,Match.Value,"")<br />Next<br />RemoveHTML=strHTML<br />Set objRegExp = Nothing<br />set Matches=nothing<br />End Function<br />%><br /></code><p>  conn.asp</p><code><br />&#160;<%<br />'on error resume next<br />set conn=server.CreateObject("adodb.connection")<br />con= "driver={Microsoft Access Driver (*.mdb)};dbq=" & Server.MapPath("stest.mdb")<br />conn.open con<br />sub connclose&#160;<br />  conn.close<br />  set conn=nothing <br />end sub<br />%><br /></code><p>  附:抓取信息的详细页面示例</p><table cellspacing="1" cellpadding="0"><tr><td><p>  总序列号:</p></td><td><p>  479280 </p></td></tr><tr><td><p>  信息类别:</p></td><td><p>  出租</p></td></tr><tr><td><p>  所在城市:</p></td><td><p>  济南</p></td></tr><tr><td><p>  房屋具体位置:</p></td><td><p>  华龙路华信路交界口</p></td></tr><tr><td><p>  房屋类型:</p></td><td><p>  其他</p></td></tr><tr><td><p>  楼层:</p></td><td><p>  六层</p></td></tr><tr><td><p>  使用面积:</p></td><td><p>  24~240 平方米之间</p></td></tr><tr><td><p>  房价:</p></td><td><p>  0  [租赁:元/月,买卖:万元/套]</p></td></tr><tr><td><p>  其他说明:</p></td><td><p>  华信商务楼3至6层小空间对外出租(0.5元/平起),本楼属纯商务办公投资使用,可用于办公写字间,周边设施齐全、交通便利(37、80、K95在本楼前经过),全产权、市证,楼内设施包括水、电、暖、电梯设施齐全,有意者可电讯!</p></td></tr><tr><td><p>  联系人:</p></td><td><p>  鲁、王</p></td></tr><tr><td><p>  联系方式:</p></td><td><p>  88017966、86812217 </p></td></tr><tr><td><p>  信息来源:</p></td><td><p>  2005-8-4 8:28:55 来自:218.98.86.175</p></td></tr><tr><td><p>  点击次数:</p></td><td><p>  19</p></td></tr></table>