实例讲解ASP实现抓取网上房产信息 - 呼啦飞学习日记~尤有窝爱

   <pre><code>
<%@LANGUAGE="VBSCRIPT" CODEPAGE="936"%>
<!-- #include file="conn.asp" -->
<!-- #include file="inc/function.asp" -->
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<title>Untitled Document</title>
<meta http-equiv="Content-Type" content="text/html; charset=gb2312">
<meta http-equiv="refresh" content="300;URL=steal_house.asp">
</head>
<body>
<%
' steal_house.asp
' Fetches a listing index page, follows each listing link, parses the labelled
' fields out of the detail page and appends unseen listings to the "news"
' table. The page reloads itself every 300 seconds (see the meta refresh).
'
' Errors are deliberately swallowed at page level so that one bad listing does
' not stop the run. Every helper below therefore returns "" (or an empty
' array) on failure instead of raising: a raised error would make the caller's
' assignment be skipped and leave the PREVIOUS listing's value in the variable.
On Error Resume Next
Server.ScriptTimeout = 999999

'========================================================
' Decodes a raw byte array into a string.
'   body - binary data (e.g. XMLHTTP.responseBody)
'   code - charset name used for decoding, e.g. "gb2312"
' Returns the decoded text.
'========================================================
Function BytesToBstr(body, code)
    Dim objstream
    Set objstream = Server.CreateObject("adodb.stream")
    objstream.Type = 1
    objstream.Mode = 3
    objstream.Open
    objstream.Write body
    ' Rewind, then re-read the same bytes as text in the requested charset.
    objstream.Position = 0
    objstream.Type = 2
    objstream.Charset = code
    BytesToBstr = objstream.ReadText
    objstream.Close
    Set objstream = Nothing
End Function

' Case-insensitive position of strng inside wstr.
' Returns Len(wstr) when strng does not occur.
Function Newstring(wstr, strng)
    Newstring = InStr(LCase(wstr), LCase(strng))
    If Newstring <= 0 Then Newstring = Len(wstr)
End Function

' Thin wrapper around Replace: every str1 in ori becomes str2.
Function ReplaceStr(ori, str1, str2)
    ReplaceStr = Replace(ori, str1, str2)
End Function

'========================================================
' Downloads url and returns the part of the page that starts at the first
' occurrence of `start` (inclusive) and stops just before the next
' occurrence of `ends`.
'   url   - address to fetch with a synchronous GET
'   code  - charset used to decode the response body
'   start - opening marker (kept in the result)
'   ends  - closing marker (not included in the result)
' Returns "" when the request fails or either marker is missing.
'========================================================
Function ReadXml(url, code, start, ends)
    Dim oSend, html, startPos, endPos

    ReadXml = ""

    ' Local handler: a network failure must yield "", not a raised error.
    On Error Resume Next
    Set oSend = CreateObject("Microsoft.XMLHTTP")
    oSend.open "GET", url, False
    oSend.send
    If Err.Number = 0 Then html = BytesToBstr(oSend.responseBody, code)
    Set oSend = Nothing
    If Err.Number <> 0 Then
        Err.Clear
        Exit Function
    End If
    On Error GoTo 0

    ' Positions live in locals; the marker arguments are passed ByRef by
    ' default and must not be overwritten.
    startPos = InStr(html, start)
    If startPos = 0 Then Exit Function
    html = Mid(html, startPos)

    endPos = InStr(html, ends)
    If endPos = 0 Then Exit Function
    ReadXml = Left(html, endPos - 1)
End Function

'========================================================
' Returns the text found between two markers.
'   body  - text to search
'   start - opening marker; the result begins right after it
'   ends  - closing marker; the result stops right before it
' Returns "" when either marker is missing.
'
' The earlier version stored the marker's position back into `start` and then
' skipped Len(position) + 1 characters, which only equals the marker length by
' coincidence. The marker length is used here.
'========================================================
Function SubStr(body, start, ends)
    Dim startPos, endPos, rest

    SubStr = ""
    startPos = InStr(body, start)
    If startPos = 0 Then Exit Function

    rest = Mid(body, startPos + Len(start))
    endPos = InStr(rest, ends)
    If endPos = 0 Then Exit Function

    SubStr = Left(rest, endPos - 1)
End Function

' Returns the value of the first href="..." attribute in an anchor tag,
' or "" when there is none.
Function ExtractHref(anchor)
    Dim hrefPos, urlStart, urlEnd

    ExtractHref = ""
    hrefPos = InStr(1, anchor, "href=""", vbTextCompare)
    If hrefPos = 0 Then Exit Function

    urlStart = hrefPos + Len("href=""")
    urlEnd = InStr(urlStart, anchor, """")
    If urlEnd = 0 Then Exit Function

    ExtractHref = Mid(anchor, urlStart, urlEnd - urlStart)
End Function

' Removes line breaks, spaces and non-breaking-space entities so that the
' field labels and values form one continuous string for SubStr.
Function StripWhitespace(text)
    Dim cleaned
    cleaned = text
    cleaned = Replace(cleaned, VbCrLf, "")
    cleaned = Replace(cleaned, vbNewLine, "")
    cleaned = Replace(cleaned, "&#160;", "")
    cleaned = Replace(cleaned, " ", "")
    cleaned = Replace(cleaned, "&nbsp;", "")
    cleaned = Replace(cleaned, "\n", "")
    cleaned = Replace(cleaned, Chr(10), "")
    cleaned = Replace(cleaned, Chr(13), "")
    StripWhitespace = cleaned
End Function

' Writes "label" followed by the HTML-encoded value and a line break.
' The value is scraped from a remote site, so it is never echoed raw.
Sub WriteField(label, value)
    Response.Write(label & Server.HTMLEncode(value) & "<br>")
End Sub

' True when id is already stored as a value in the Application collection.
Function IsKnownListing(id)
    Dim n

    IsKnownListing = False
    ' Start at 1 (NOTE(review): the Contents collection appears to be 1-based; confirm).
    For n = 1 To Application.Contents.Count
        ' Only compare plain strings; other pages may store other types.
        If VarType(Application.Contents(n)) = vbString Then
            If Application.Contents(n) = id Then
                IsKnownListing = True
                Exit For
            End If
        End If
    Next
End Function

Dim getcont, NewsContent
Dim url
Dim KeyId, NewsClass, City, Position, HouseType, Level, Area, Price, Demostra
Dim ContactMan, Contact
Dim i, tempLink, rs

url = "http://www.***.com" ' address of the listing index page
getcont = ReadXml(url, "gb2312", "<table class=k2 border=""0""", "</table>")
getcont = RegexHtml(getcont)

' The loop starts at the third anchor, as the original did.
For i = 2 To UBound(getcont)
    Response.Write(Server.HTMLEncode(getcont(i)) & "__<br>")

    ' NOTE(review): after "../" is stripped the link may still be relative;
    ' XMLHTTP needs an absolute URL - confirm against the real site.
    tempLink = Replace(ExtractHref(getcont(i)), "../", "")
    Response.Write(i & ":" & Server.HTMLEncode(tempLink) & "<br>")

    If tempLink <> "" Then
        NewsContent = ReadXml(tempLink, "gb2312", "<td valign=""bottom"" width=""400"">", "<hr width=""760"" noshade size=""1"" color=""#808080""> ")
        NewsContent = StripWhitespace(RemoveHtml(NewsContent))

        '===============get Content=======================
        Response.Write(Server.HTMLEncode(NewsContent))
        KeyId = SubStr(NewsContent, "列号：", "信息类别：")
        NewsClass = SubStr(NewsContent, "类别：", "所在城市：")
        City = SubStr(NewsContent, "城市：", "房屋具体位置：")
        Position = SubStr(NewsContent, "位置：", "房屋类型：")
        HouseType = SubStr(NewsContent, "类型：", "楼层：")
        Level = SubStr(NewsContent, "楼层：", "使用面积：")
        Area = SubStr(NewsContent, "面积：", "房价：")
        Price = SubStr(NewsContent, "房价：", "其他说明：")
        Demostra = SubStr(NewsContent, "说明：", "联系人：")
        ContactMan = SubStr(NewsContent, "联系人：", "联系方式：")
        Contact = SubStr(NewsContent, "联系方式：", "信息来源：")

        WriteField "总序列号:", KeyId
        WriteField "信息类别:", NewsClass
        WriteField "所在城市:", City
        WriteField "房屋具体位置:", Position
        WriteField "房屋类型:", HouseType
        WriteField "楼层:", Level
        WriteField "使用面积:", Area
        WriteField "房价:", Price
        WriteField "其他说明:", Demostra
        WriteField "联系人:", ContactMan
        WriteField "联系方式:", Contact

        ' Skip pages that could not be parsed and listings already seen.
        ' The check runs per listing, so one duplicate no longer blocks every
        ' later listing in the same run.
        ' NOTE(review): seen ids live only in Application state and are lost
        ' when the application restarts; KeyId is not written to the table.
        If KeyId <> "" Then
            If Not IsKnownListing(KeyId) Then
                Application(Time & i) = KeyId

                ' Append the listing to the database.
                '====================================================
                Set rs = Server.CreateObject("adodb.recordset")
                rs.Open "select top 1 * from news order by id desc", conn, 3, 3
                rs.AddNew
                rs("NewsClass") = NewsClass
                rs("City") = City
                rs("Position") = Position
                rs("HouseType") = HouseType
                rs("Level") = Level
                rs("Area") = Area
                rs("Price") = Price
                rs("Demostra") = Demostra
                rs("ContactMan") = ContactMan
                rs("Contact") = Contact
                rs.Update
                rs.Close
                Set rs = Nothing
            End If
        End If
        '==================================================
    End If
Next

' Collects up to 15 anchor elements from body, with <td>/<tr> tags removed
' from each. Returns a fixed 16-slot array; unused slots stay Empty.
Function RemoveTag(body)
    Dim regEx, Matches, Match, TempStr
    Dim i, arr(15)

    Set regEx = New RegExp
    regEx.Pattern = "<[a].*?<\/[a]>"
    regEx.IgnoreCase = True
    regEx.Global = True
    Set Matches = regEx.Execute(body)

    i = 0
    For Each Match In Matches
        TempStr = Match.Value
        TempStr = Replace(TempStr, "<td>", "")
        TempStr = Replace(TempStr, "</td>", "")
        TempStr = Replace(TempStr, "<tr>", "")
        TempStr = Replace(TempStr, "</tr>", "")
        arr(i) = TempStr
        i = i + 1
        If i >= 15 Then Exit For
    Next

    Set regEx = Nothing
    Set Matches = Nothing
    RemoveTag = arr
End Function

' Returns an array holding every anchor element found in body, in document
' order. The array is sized to the number of matches, so UBound is -1 when
' there are none and large pages cannot overflow it.
Function RegexHtml(body)
    Dim regEx2, Matches2, r_arr, idx

    Set regEx2 = New RegExp
    regEx2.Pattern = "<a.*?<\/a>"
    regEx2.IgnoreCase = True
    regEx2.Global = True
    Set Matches2 = regEx2.Execute(body)

    If Matches2.Count = 0 Then
        r_arr = Array()
    Else
        ReDim r_arr(Matches2.Count - 1)
        For idx = 0 To Matches2.Count - 1
            r_arr(idx) = Matches2(idx).Value
        Next
    End If

    RegexHtml = r_arr
    Set regEx2 = Nothing
    Set Matches2 = Nothing
End Function
'======================================================
' Release the connection through the helper defined in conn.asp.
connclose
%>
</body>
</html>
</code></pre><p>　　function.asp</p><pre><code>
<%
'**************************************************
' Function: gotTopic
' Purpose : truncates a string to a display width; a double-byte (e.g.
'           Chinese) character counts as two, any other character as one.
' Params  : str    - original string
'           strlen - maximum display width
' Returns : the truncated string with "…" appended once the width is
'           reached, HTML-escaped for output; "" for an empty or Null input.
'**************************************************
Function gotTopic(str, strlen)
    Dim text, total, width, code, i

    gotTopic = ""
    If IsNull(str) Then Exit Function
    If str = "" Then Exit Function

    ' Work on a local copy: str is passed ByRef by default and the caller's
    ' variable must not be changed.
    text = Replace(Replace(Replace(Replace(str, "&nbsp;", " "), "&quot;", Chr(34)), "&gt;", ">"), "&lt;", "<")
    text = Replace(text, "?", "")

    total = Len(text)
    width = 0
    gotTopic = text
    For i = 1 To total
        ' Under a double-byte code page Asc is negative for double-byte
        ' characters, hence Abs before the comparison.
        code = Abs(Asc(Mid(text, i, 1)))
        If code > 255 Then width = width + 2 Else width = width + 1
        If width >= strlen Then
            gotTopic = Left(text, i) & "…"
            Exit For
        End If
    Next

    gotTopic = Replace(Replace(Replace(Replace(gotTopic, " ", "&nbsp;"), Chr(34), "&quot;"), ">", "&gt;"), "<", "&lt;")
End Function

'=========================================================
' Function: RemoveHTML(strHTML)
' Purpose : removes HTML tags.
' Params  : strHTML - string to strip tags from
' Returns : the text with every <...> tag removed; "" for Null input.
'=========================================================
Function RemoveHTML(strHTML)
    Dim objRegExp

    RemoveHTML = ""
    If IsNull(strHTML) Then Exit Function

    Set objRegExp = New RegExp
    objRegExp.IgnoreCase = True
    objRegExp.Global = True
    ' [^>] also matches line breaks, so tags that span lines are removed too.
    objRegExp.Pattern = "<[^>]+>"
    ' One regex pass; the caller's string is left unchanged.
    RemoveHTML = objRegExp.Replace(strHTML, "")
    Set objRegExp = Nothing
End Function
%>
</code></pre><p>　　conn.asp</p><pre><code>
<%
' Opens the shared ADO connection to the Access database stest.mdb.
'on error resume next
set conn=server.CreateObject("adodb.connection")
con= "driver={Microsoft Access Driver (*.mdb)};dbq=" & Server.MapPath("stest.mdb")
conn.open con

' Closes and releases the shared connection. Safe to call more than once.
Sub connclose
    If IsObject(conn) Then
        If Not conn Is Nothing Then
            ' State 0 means the connection is already closed.
            If conn.State <> 0 Then conn.Close
            Set conn = Nothing
        End If
    End If
End Sub
%>
</code></pre><p>　　附：抓取信息的详细页面事例</p><table cellspacing="1" cellpadding="0"><tr><td><p>　　总序列号：</p></td><td><p>　　479280　</p></td></tr><tr><td><p>　　信息类别：</p></td><td><p>　　出租</p></td></tr><tr><td><p>　　所在城市：</p></td><td><p>　　济南</p></td></tr><tr><td><p>　　房屋具体位置：</p></td><td><p>　　华龙路华信路交界口</p></td></tr><tr><td><p>　　房屋类型：</p></td><td><p>　　其他</p></td></tr><tr><td><p>　　楼层：</p></td><td><p>　　六层</p></td></tr><tr><td><p>　　使用面积：</p></td><td><p>　　24～240 平方米之间</p></td></tr><tr><td><p>　　房价：</p></td><td><p>　　0 　[租赁:元/月,买卖:万元/套]</p></td></tr><tr><td><p>　　其他说明：</p></td><td><p>　　华信商务楼3至6层小空间对外出租（0.5元/平起），本楼属纯商务办公投资使用，可用于办公写字间，周边设施齐全、交通便利（37、80、K95在本楼前经过），全产权、市证，楼内设施包括水、电、暖、电梯设施齐全，有意者可电讯！</p></td></tr><tr><td><p>　　联系人：</p></td><td><p>　　鲁、王</p></td></tr><tr><td><p>　　联系方式：</p></td><td><p>　　88017966、86812217 </p></td></tr><tr><td><p>　　信息来源：</p></td><td><p>　　2005-8-4 8:28:55　来自：218.98.86.175</p></td></tr><tr><td><p>　　点击次数：</p></td><td><p>　　19</p></td></tr></table>