11 imgtmp=re.compile(\ self.partimgs_show.add(imgtmp) 7 1 elif imgtmp_flag!='no' and imgtmp!='': 8 self.partimgs.add(imgtmp)# 11 def handle_data(self, data): if self.processing: 9 1 self.data+=data 2 if self.flag['title']=='match':#获取成功 0 12 self.title=data if self.flag['jdprice']=='match': self.jdprice=data.strip() 1 1 if self.flag['typeOrsize']=='match': 21 self.typeOrsize.add(data.strip()) if self.flag['refprice']=='match': 2 2 self.refprice=data.strip() 3 if self.flag['partdetail']=='match' and re.search(r':12',data):#获取成功 keytmp=data.split(\:\ 4 1 valuetmp=data.split(\:\ 2 self.partdetail[keytmp]=valuetmp 5 12 if self.flag['specification']=='match' and data.strip() != '' and data.strip()!='主体': self.specification.append(data.strip()) 6 1 else: 21 pass def handle_endtag(self, tag): 7 2 if tag==self.processing: 8 self.processing=None 12 def getdata(self): return {'title':self.title,'partimgs_show':self.partimgs_sh9 1ow,'jdprice':self.jdprice,'refprice':self.refprice,'partimgs':self.3partimgs,'partdetail':self.partdetail,'specification':self.specific0 13ation,'typeOrsize':self.typeOrsize} #定义方法httpread,用于发起http的get请求,返回http的获取内容 #这也是代码抽象的结果,如若不抽象这块代码出来,后续你回发现很多重复1 13的写这块代码 def httpread(host,url,headers): 2 1 httprestmp='' 3 try: 3 13 conn = httplib.HTTPConnection(host) conn.request('GET',url,None,headers) httpres = conn.getresponse() 4 1 httprestmp=httpres.read() 31 except Exception,e: conn = httplib.HTTPConnection(host) 5 3 conn.request('GET',url,None,headers) 6 httpres = conn.getresponse() 13 httprestmp=httpres.read() print e 7 1 finally: 3 if conn: 8 13 conn.close() return httprestmp sendhttp,调用httpread,获取结果并替换编码(gbk换为utf-8),#定义方法9 411并保存到文件中(以免下次再去下载页面,这样就节省了时间) # def sendhttp(url,host,savefile): http头部,很多网站对于你不携带User-Agent及Referer等情况,0 4 #定义141 是不允许你爬取。 #具体的http的头部有些啥信息,你可以看chrome,右键审查元素,点击network,点击其中一个链接,查看request header 2 1 headers = {\ 4 \ 3 14 \ \ charset=UTF-8\ 4 1 \ 41 \Gecko) Chrome/3.0.w4.\ 5 4 \6 14com|utmccn=(refrral)|utmcmd=rferral|utmcct=/order/getnfo.action; _pst=xx89; pin=x9; unick=jaa; cshi3.com=D6045EA24A6FB9; _tp=sdyuew8r9e7r9oxr3245==; user-key=1754; cn=0; ipLocation=缌; ipLoc97;7 1 areaId=1; mt_ext2:'d; aview=6770.106|68|5479.665|675.735|6767.41100|6757.13730|6ee.9ty711|1649.10440; atw=65.15.325.24353.-4|188.3424.-10|22; __j34|72.2234; __jdc=2343423; __jdve|-; __jdu=3434\ 8 4 } 9 httprestmp='' 15 try: httprestmp=httpread(host,url,headers) 0 1 if httprestmp=='':# 5 httprestmp=httpread(host,url,headers) 1 15 if httprestmp=='':#重试2次 httprestmp=httpread(host,url,headers) except Exception,e: 2 1 try: 51 httprestmp=httpread(host,url,headers) if httprestmp=='':# 2次 3 5 httprestmp=httpread(host,url,headers) 4 if httprestmp=='':#重试15 httprestmp=httpread(host,url,headers) except Exception,e: 5 1 print e 5 print e 6 15 if re.search(r'charset=gb2312',httprestmp):#如果是gb2312得编码,就要转码为utf-8(因为全局都使用了utf-8) httprestmp.replace(\ 7 1 try: 51 httprestmp=httprestmp.decode('gbk').encode('utf-8')#有可能转码失败,所以要加上try 8 5 except Exception,e:#如果html编码本来就是utf8或者转换编码出9 错的时候,就啥都不做,就用原始内容 16 print e try: 0 1 with open(savefile, 'w') as file_object: 61 file_object.write(httprestmp) file_object.flush() 1 6 except Exception,e: 2 print e 16 return httprestmp #list的页面的解析方法 3 1def parseListpageurl(listpageurl): 6 urlobj=urlparse(listpageurl) 4 16 if urlobj.query: geturl=urlobj.path+\ else: 5 1 geturl=urlobj.path 61 htmlfile=\ if not os.path.exists(htmlfile): 6 6 httpresult=sendhttp(geturl,urlobj.hostname,htmlfile) 7 with open(htmlfile) as file: 16 htmlcontent=file.read() parser= ListPageParser()#声明一个解析对象 8 1 #http://list.jd.com/list.html?cat=737,794,870&page=611&JL=6_0_0,所以这里需要把'amp;'去掉 9 17 parser.feed(htmlcontent.replace('amp;',''))#将html的内容feed进去 get数据即可 #print 'debug:htmlcontent',htmlcontent 0 1 finalparseurl=parser.getlinks()#然后71 lastpageurl=parser.getlasturl() urlobj_lastpageurl=urlparse(lastpageurl) 1 7 #print 'debug:urlobj_lastpageurl',urlobj_lastpageurl 2 totalPageNo='0' 17 #print urlobj if re.search(r'&',urlobj_lastpageurl.query): 3 1 try: 7 totalPageNo=urlobj_lastpageurl.query.split(\4 17split(\获得总共有多少页 except Exception,e: print \ 5
相关推荐: