python初学者pycharm详细教程pdf下载
def cUrl(self, full_url):
    """Download ``full_url`` and return its <article> element after escaping.

    The element found by ``find('article')`` is handed to
    ``self.transferContent`` exactly as returned, with no conversion.
    """
    # Fetch the raw page body.
    raw_page = urllib.request.urlopen(full_url).read()
    # Parse it and pick out the first <article> element.
    cont_p = BeautifulSoup(raw_page, "html.parser").find('article')
    return self.transferContent(cont_p)
通过上面的函数获取页面中article标签的内容
执行下面sql语句把获取的内容插入数据库
# Build the INSERT by %-formatting the three values straight into the SQL text.
# NOTE(review): a quote character inside any value ends the SQL string literal early.
cur.execute("insert into p_links(title,href,content) values ('%s','%s','%s')" % (titleContents,full_url,cont_p))
执行sql时,提示错误,发现是因为cont_p中的单引号和双引号引起的
于是使用如下函数转义引号
def transferContent(self, content):
    """Return ``content`` with quotes and backslashes backslash-escaped.

    ``None`` is passed through unchanged.  ``content`` is walked item by
    item: an item equal to a double quote, single quote or backslash is
    replaced by its escaped form, and every other item is appended as
    ``str(item)``.
    """
    if content is None:
        return None

    pieces = []
    for item in content:
        if item == '"':
            pieces.append('\\"')
        elif item == "'":
            pieces.append("\\'")
        elif item == "\\":
            pieces.append("\\\\")
        else:
            pieces.append(str(item))
    return "".join(pieces)
发现转义不成功
于是打印cont_p类型
# Show the runtime type of cont_p to see what transferContent actually receives.
print(type(cont_p))
结果是如下类型:<class 'bs4.element.Tag'>
才发现是bs4自定义的Tag类型,而我的转义函数是针对字符串的
所以进行一步转化操作cont_p = str(cont_p)
修改函数如下:
def cUrl(self, full_url):
    """Download ``full_url`` and return its <article> markup, escaped for SQL.

    Args:
        full_url: Absolute URL of the page to fetch.

    Returns:
        The <article> element rendered as a string and passed through
        ``self.transferContent``, or ``None`` when the page has no
        <article> element.
    """
    # The with-block closes the HTTP response once the body has been read.
    with urllib.request.urlopen(full_url) as c_url:
        cont = c_url.read()
    cont_soup = BeautifulSoup(cont, "html.parser")
    cont_p = cont_soup.find('article')
    if cont_p is None:
        # find() returns None when nothing matches; str(None) would turn
        # that into the literal text "None" and store it as page content.
        return None
    cont_p = str(cont_p)  # convert the bs4 Tag to a plain string
    return self.transferContent(cont_p)
完整代码如下:
文件1:spider.py
import urllib.request

from bs4 import BeautifulSoup
class cSpider:
    """Fetch a page's <article> element and escape it for a quoted SQL literal."""

    # Translation table: each character that would end or alter a quoted
    # SQL string literal is mapped to its backslash-escaped form.
    _SQL_ESCAPES = str.maketrans({'"': '\\"', "'": "\\'", "\\": "\\\\"})

    def cUrl(self, full_url):
        """Download ``full_url`` and return its <article> markup, escaped.

        Args:
            full_url: Absolute URL of the page to fetch.

        Returns:
            The <article> element rendered as a string and passed through
            ``transferContent``, or ``None`` when the page has no
            <article> element.
        """
        # The with-block closes the HTTP response once the body is read.
        with urllib.request.urlopen(full_url) as c_url:
            cont = c_url.read()
        cont_soup = BeautifulSoup(cont, "html.parser")
        cont_p = cont_soup.find('article')
        if cont_p is None:
            # find() returns None when nothing matches; str(None) would
            # turn that into the literal text "None".
            return None
        cont_p = str(cont_p)  # convert the bs4 Tag to a plain string
        return self.transferContent(cont_p)

    def transferContent(self, content):
        """Backslash-escape quotes and backslashes in ``content``.

        Args:
            content: Text to escape.  Non-string values are converted with
                ``str()`` first, so the escaping always works on characters
                rather than on whatever the object yields when iterated.

        Returns:
            The escaped string, or ``None`` when ``content`` is ``None``.
        """
        if content is None:
            return None
        # One pass with a translation table instead of per-character +=.
        return str(content).translate(self._SQL_ESCAPES)
文件2:contentSpider.py
import urllib
from urllib import request,parse
from bs4 import BeautifulSoup
import re
import pymysql
# cSpider lives in spider.py (file 1). Importing it from "contentSpider"
# would make this script import itself and fail before cSpider exists.
from spider import cSpider

# Connect to MySQL.
db = pymysql.Connect(host="localhost",user="root",password="root",db="python",charset='utf8')
cur = db.cursor(cursor=pymysql.cursors.DictCursor)

try:
    # Fetch the index page; the with-block closes the HTTP response.
    base_url = "http://www.3qphp.com/php/index.html"
    with urllib.request.urlopen(base_url) as urlb:
        cent = urlb.read()

    # Collect every anchor whose href looks like /php/<section>/<id>.html.
    soup = BeautifulSoup(cent,"html.parser")
    slink = soup.find_all("a",href=re.compile(r"\/php\/(.+?)\/(\d+).html"))

    # One spider instance is enough; it keeps no per-page state.
    cS = cSpider()

    for link in slink:
        # Filter out duplicates: skip anchors that carry a class attribute,
        # have no children, or whose first child is a <span>.
        if 'class' in link.attrs or not link.contents:
            continue
        first_child = link.contents[0]
        if first_child.name == 'span':
            continue

        full_url = urllib.parse.urljoin(base_url,link['href'])
        # Escape title and URL with the same routine used for the content,
        # rather than rewriting single quotes to double quotes.
        titleContents = cS.transferContent(str(first_child))
        href_value = cS.transferContent(full_url)

        try:
            cont_p = cS.cUrl(full_url)
        except OSError as err:
            # urllib's URLError/HTTPError derive from OSError; one page that
            # fails to download should not abort the whole crawl.
            print(err)
            continue
        if cont_p is None:
            # Nothing to store for this page.
            continue

        # NOTE(review): the SQL is still assembled by string formatting
        # because cUrl returns text that is already backslash-escaped; binding
        # it as a query parameter would store those backslashes literally.
        # Switching to cur.execute(sql, params) requires cUrl to return
        # unescaped text first.
        try:
            cur.execute("insert into p_links(title,href,content) values ('%s','%s','%s')" % (titleContents,href_value,cont_p))
        except pymysql.MySQLError as err:
            print(err)

    db.commit()
finally:
    # Release the cursor and the connection even if the crawl fails.
    cur.close()
    db.close()
转载请注明:谷谷点程序 » Python3 爬虫 BeautifulSoup模块(4): bs4 Tag类型转换为字符串 insert插入数据错误