(毕设1)爬虫+mysql+flask+echarts实现网站数据可视化(附源码) 您所在的位置:网站首页 基于python网络爬虫毕业设计 (毕设1)爬虫+mysql+flask+echarts实现网站数据可视化(附源码)

(毕设1)爬虫+mysql+flask+echarts实现网站数据可视化(附源码)

2023-10-28 09:18| 来源: 网络整理| 查看: 265

目录

1. 项目要求与内容

 2.数据爬取

2.1分析url,网页源码

2.2编写代码

2.3 数据清洗

3.数据存储

3.1 mysql中需要建立的6张表

3.2 建表语句

3.3将2中清洗后的数据通过navicat导入

4.flask web网站的搭建

4.1 使用pycharm建一个flask项目

4.2 后端

4.3 前端

4.4项目框架

5.项目展示

6.源码 点击

1. 项目要求与内容

利用python爬取数据并进行清洗和预处理,将清洗后的数据存到mysql数据库中,后端利用flask框架,用Echarts实现数据可视化。

1.2完成项目基本内容:

爬取豆瓣读书top250网页上相关信息;对爬取保存下来的数据文件进行清洗和预处理;将清洗好的数据导入数据库;进行需求分析,对要实现的数据可视化效果进行分析,创建相应的数据库表并导入数据;选用python语言进行后端管理。

2. 开发工具及相关技术:pycharm、navicat、mysql

 2.数据爬取 2.1分析url,网页源码 2.2编写代码 # 1. 导入库包 import requests from lxml import etree from time import sleep import os import pandas as pd import re BOOKS = [] IMGURLS = [] # 2. 获取网页源代码 def get_html(url): headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } # 异常处理 try: html = requests.get(url, headers=headers) # 声明编码方式 html.encoding = html.apparent_encoding # 判断 if html.status_code == 200: print('成功获取源代码') # print(html.text) except Exception as e: print('获取源代码失败:%s' % e) # 返回html return html.text # 3. 解析网页源代码 def parse_html(html): html = etree.HTML(html) # 每个图书信息分别保存在 class="indent" 的div下的 table标签内 tables = html.xpath("//div[@class='indent']//table") # print(len(tables)) # 打印之后如果是25的话就是对的 books = [] imgUrls = [] # 遍历通过xpath得到的li标签列表 # 因为要获取标题文本,所以xpath表达式要追加 /text(), t.xpath返回的是一个列表,且列表中只有一个元素所以追加一个[0] for t in tables: # title = t.xpath(".//div[@class='p12']/a/@title") # 匹配得到的是空的 # 书名 title = t.xpath(".//td[@valign='top']//a/@title")[0] # 链接 link = t.xpath(".//td[@valign='top']//a/@href")[0] # 获取pl标签的字符串 pl = t.xpath(".//td[@valign='top']//p[1]/text()")[0] # 截取国家 if '[' in pl: country = pl.split('[')[1].split(']')[0] else: country = '中' # 没有国家的默认为“中国” # 截取作者 if '[' in pl: author = pl.split(']')[1].split('/')[0].replace(" ", "") elif len(pl.split('/')) == 3: author = '无' elif len(pl.split('/')) == 2: author = pl.split('/')[0] elif '[' not in pl: if len(pl.split('/')) == 4: author = pl.split('/')[-4] elif len(pl.split('/')) == 5: author = pl.split('/')[-5] elif len(pl.split('/')) == 6: author = pl.split('/')[-6] else: author = '无' # 截取翻译者 if len(pl.split('/')) == 3: translator = ' ' elif '[' in pl: if len(pl.split('/')) == 4: translator = pl.split('/')[-3] elif len(pl.split('/')) == 5: translator = pl.split('/')[-4] elif len(pl.split('/')) == 6: translator = pl.split('/')[-5] else: translator = ' ' # 截取出版社 if len(pl.split('/')) == 2: publisher=pl.split('/')[0] elif 
len(pl.split('/'))== 3: publisher = pl.split('/')[0] elif '[' in pl: if len(pl.split('/'))== 4: publisher = pl.split('/')[1] elif len(pl.split('/')) == 5: publisher = pl.split('/')[2] elif len(pl.split('/')) == 6: publisher = pl.split('/')[-3] elif len(pl.split('/')) == 7: publisher = pl.split('/')[-4] elif '[' not in pl: #if len(pl.split('/'))== 3: publisher = pl.split('/')[-3] # if len(pl.split('/')) == 6: # publisher = pl.split('/')[-3] # elif len(pl.split('/')) == 7: # publisher = pl.split('/')[-4] # 截取出版时间 if len(pl.split('/')) == 2: time= '不详' elif len(pl.split('/')) == 4: time = pl.split('/')[-2] elif len(pl.split('/')) == 5: time = pl.split('/')[-2] elif len(pl.split('/')) == 6: time = pl.split('/')[-2] # 截取单价 if '元' in pl: price = pl.split('/')[-1].split('元')[0] else: price = pl.split('/')[-1] # 获取星级数 str1 = t.xpath(".//td[@valign='top']//div[@class='star clearfix']/span[1]/@class")[0].replace("allstar", "") # 此时获取到的数字其实是字符串类型,不能直接%10,需要把str转化为int num = int(str1) star = num / 10 # 获取评分 score = t.xpath(".//td[@valign='top']//div[@class='star clearfix']/span[2]/text()")[0] # 获取评价人数 pnum = t.xpath(".//td[@valign='top']//div[@class='star clearfix']/span[3]/text()")[0] people = re.sub("\D", "", pnum) # 获取简介 comments = t.xpath(".//p[@class='quote']/span/text()") comment = comments[0] if len(comments) != 0 else "无" book = { '书名': title, '链接': link, '国家': country, '作者': author, '翻译者': translator, '出版社': publisher, '出版时间': time, '价格': price, '星级': star, '评分': score, '评价人数': people, '简介': comment } # 图片 imgUrl = t.xpath(".//a/img/@src")[0] # print(imgUrl) books.append(book) imgUrls.append(imgUrl) return books, imgUrls # 4. 下载图片保存文件 def downloadimg(url, book): # 判断文件夹是否在指定路径下面,建立文件夹并把指定路径移到文件夹下面 if 'img' in os.listdir(r'D:\pachong'): pass else: os.mkdir(r'D:\pachong\img') os.chdir(r'D:\pachong\img') # 返回img的二进制流 img = requests.request('GET', url).content with open(book['书名'] + '.jpg', 'wb') as f: # print('正在下载: %s' % url) f.write(img) # 5. 
数据预处理 # def processData(): if __name__ == '__main__': # url = 'https://book.douban.com/top250?start=0' # 10页循环遍历 for i in range(10): # 2. 定义url并获取网页源代码 url = 'https://book.douban.com/top250?start={}'.format(i * 25) # print(url) html = get_html(url) # 3. 解析网页源代码 sleep(1) books = parse_html(html)[0] imgUrls = parse_html(html)[1] BOOKS.extend(books) IMGURLS.extend(imgUrls) # 4. 下载图片保存文件 # for i in range(250): # # sleep(1) # downloadimg(IMGURLS[i], BOOKS[i]) os.chdir(r'D:/pachong/img') # 以csv格式写入本地 bookdata = pd.DataFrame(BOOKS) bookdata.to_csv('D:/pachong/book.csv', index=False) print("图书信息写入本地成功") # 以txt格式写入本地错误 # 得到的是字典格式,要想写成txt格式需要先转化成字符串格式 # for i in range(25): # with open('book.txt', 'a') as f: # f.write(books[i] + '\n')

2.3 数据清洗 #数据清洗 import pandas as pd import re import time # 先读取数据文件 data = pd.read_csv('D:/pachong/book.csv') result = pd.DataFrame(data) a = result.dropna(axis=0, how='any') pd.set_option('display.max_rows', None) #输出全部行,不省略 b = u'数据' number = 1 b1 = '1981-8' li1 = a['出版社'] for i in range(0, len(li1)): try: if b1 in li1[i]: # print(number,li1[i]) number += 1 a = a.drop(i, axis=0) except: pass b2 = '中国基督' a['出版时间'] = a['出版时间'].str[0: 5] li2 = a['出版时间'] for i in range(0, len(li2)): try: if b2 in li2[i]: # print(number,li2[i]) number += 1 a = a.drop(i, axis=0) except: pass b3 = 'CNY' li3 = a['价格'] for i in range(0, len(li3)): try: if b3 in li3[i]: a['价格'] = li3.str.replace('CNY', '') except: pass b41 = '清' b42 = '明' li4 = a['国家'] a['国家'] = li4.str.replace("国", "") for i in range(0, len(li4)): try: if b41 in li4[i]: a['国家'] = li4.str.replace('清', '中') if b42 in li4[i]: a['国家'] = li4.str.replace('明', '中') except: pass time.sleep(3) a.to_csv('newbook.csv', index=False)

3.数据存储 3.1 mysql中需要建立的6张表 mysql的6张表:books(原始数据表)、book_country_num(出版国家表)、book_publisher_num(出版社表)、book_presstime_num(出版时间表)、book_people_title(评论人数top10数据表)、book_score_num(评分数据表)

3.2 建表语句 CREATE TABLE `books` ( `title` varchar(255) DEFAULT NULL, `link` varchar(255) DEFAULT NULL, `country` varchar(255) DEFAULT NULL, `author` varchar(255) DEFAULT NULL, `translator` varchar(255) DEFAULT NULL, `publisher` varchar(255) DEFAULT NULL, `press_time` int(11) DEFAULT NULL, `price` decimal(10,2) DEFAULT NULL, `star` float DEFAULT NULL, `score` float DEFAULT NULL, `people` int(11) DEFAULT NULL, `comment` varchar(255) DEFAULT NULL ) ENGINE=InnoDB DEFAULT CHARSET=utf8; CREATE TABLE `book_country_num` ( `country` varchar(255) DEFAULT NULL COMMENT '国家', `num` int(11) DEFAULT NULL COMMENT '数量' ) ENGINE=InnoDB DEFAULT CHARSET=utf8; insert into book_country_num select country, count(*) as num from books group by country; CREATE TABLE `book_publisher_num` ( `publisher` varchar(255) DEFAULT NULL COMMENT '出版社', `num` int(11) DEFAULT NULL COMMENT '数量' ) ENGINE=InnoDB DEFAULT CHARSET=utf8; insert into book_publisher_num select publisher, count(*) as num from books group by publisher; CREATE TABLE `book_presstime_num` ( `press_time` int(11) DEFAULT NULL COMMENT '出版时间', `num` int(11) DEFAULT NULL COMMENT '数量' ) ENGINE=InnoDB DEFAULT CHARSET=utf8; insert into book_presstime_num select press_time, count(*) as num from books group by press_time; CREATE TABLE `book_people_title` ( `people` int(11) DEFAULT NULL COMMENT '评论人数', `title` varchar(255) DEFAULT NULL COMMENT '书名' ) ENGINE=InnoDB DEFAULT CHARSET=utf8; insert into book_people_title select people, title from books group by people order by people desc limit 10; CREATE TABLE `book_score_num` ( `score` float DEFAULT NULL COMMENT '评分', `num` int(11) DEFAULT NULL COMMENT '数量' ) ENGINE=InnoDB DEFAULT CHARSET=utf8; insert into book_score_num select score, count(*) as num from books group by score; 3.3将2中清洗后的数据通过navicat导入

4.flask web网站的搭建 4.1 使用pycharm建一个flask项目 4.2 后端 from flask import Flask,render_template import pymysql app = Flask(__name__) @app.route('/') def index(): return render_template("index.html") @app.route('/index') def home(): #return render_template("index.html") return index() @app.route('/movie') def movie(): datalist = [] con = pymysql.connect( host='192.168.43.100', port=3306, user='root', passwd='Root@123', db='doubanbook', charset='utf8' ) cur = con.cursor() sql = "select * from books" data = cur.execute(sql) result=cur.fetchall() for item in result: datalist.append(item) cur.close() cur.close() print(datalist) return render_template("movie.html", movies=datalist) @app.route('/score') def score(): score = [] #评分 num = [] #每个评分所统计出的电影数量 conn = pymysql.Connect( host='192.168.43.100', port=3306, user='root', passwd='Root@123', db='doubanbook', charset='utf8' ) cur = conn.cursor() sql = "select * from book_score_num" data = cur.execute(sql) result = cur.fetchall() for item in result: score.append(str(item[0])) num.append(item[1]) cur.close() conn.close() return render_template("score.html",score= score,num=num) @app.route('/country') def country(): country = [] #评分 num = [] #每个评分所统计出的电影数量 conn = pymysql.Connect( host='192.168.43.100', port=3306, user='root', passwd='Root@123', db='doubanbook', charset='utf8' ) cur = conn.cursor() sql = "select * from book_country_num" data = cur.execute(sql) result = cur.fetchall() for item in result: country.append(str(item[0])) num.append(item[1]) cur.close() conn.close() return render_template("country.html",country=country,num=num) @app.route('/peopletop10') def peopletop10(): people = [] #评论人数 title = [] #书名 s=[] conn = pymysql.Connect( host='192.168.43.100', port=3306, user='root', passwd='Root@123', db='doubanbook', charset='utf8' ) cur = conn.cursor() sql = "select * from book_people_title" data = cur.execute(sql) result = cur.fetchall() for item in result: s.append(item) people.append(str(item[0])) title.append(item[1]) 
cur.close() conn.close() return render_template("peopletop10.html", people=people,title=title) @app.route('/presstime') def presstime(): year = [] num = [] s=[] conn = pymysql.Connect( host='192.168.43.100', port=3306, user='root', passwd='Root@123', db='doubanbook', charset='utf8' ) cur = conn.cursor() sql = "select * from book_presstime_num" data = cur.execute(sql) result = cur.fetchall() for item in result: s.append(item) year.append(str(item[0])) num.append(item[1]) cur.close() conn.close() return render_template("presstime.html", year=year,num=num) @app.route('/publisher') def publisher(): year = [] num = [] s=[] conn = pymysql.Connect( host='192.168.43.100', port=3306, user='root', passwd='Root@123', db='doubanbook', charset='utf8' ) cur = conn.cursor() sql = "select * from book_publisher_num" data = cur.execute(sql) result = cur.fetchall() for item in result: s.append(item) year.append(str(item[0])) num.append(item[1]) cur.close() conn.close() return render_template("publisher.html", year=year,num=num) @app.route('/word') def word(): return render_template("word.html") @app.route('/team') def team(): return render_template("team.html") if __name__ == '__main__': app.run() 4.3 前端 豆瓣图书Top250数据分析 孙浩 首页 电影 国家 评分 评论人数top10 出版年份 出版社 词云 团队 豆瓣电影Top250电影 书名 作者 译者 出版社 出版时间 价格 星级 评分 评价人数 一句话概述 {% for movie in movies %}


【本文地址】

公司简介

联系我们

今日新闻

    推荐新闻

    专题文章
      CopyRight 2018-2019 实验室设备网 版权所有