java爬取前程无忧(51job) 您所在的位置:网站首页 爬取51job招聘网数据beautifulsoup java爬取前程无忧(51job)

java爬取前程无忧(51job)

2024-06-02 07:40| 来源: 网络整理| 查看: 265

什么是Jsoup

jsoup是一款优秀的Java的HTML解析器,主要用来对HTML解析,就是dom的操作,有很多和js方法一样,如getElementById,select等,使用起来非常的方便,不清楚的朋友可以去学习下这里就不强调了。

爬取前程无忧准备工作

1.创建一个普通的maven工程 在这里插入图片描述 2.在 pom.xml文件中引入依赖

org.apache.httpcomponents httpclient 4.5.9 org.apache.httpcomponents httpcore 4.4.11 org.jsoup jsoup 1.12.1 分析51job网站

在这里插入图片描述 如何获取所有的城市和代码? 在这里插入图片描述 在这里插入图片描述

开始爬取51job

观察51job的字段创建对应的实体类表 在这里插入图片描述 创建Jobs实体类

public class Jobs { private Integer jobId;//自动增长id private String jobName; //岗位 private String companyName;//公司名 private String workAddr;//公司地址 private String salary;//薪水 private String pushDate;//发布日期 private String jobKey;//对应城市的id public Integer getJobId() { return jobId; } public void setJobId(Integer jobId) { this.jobId = jobId; } public String getJobName() { return jobName; } public void setJobName(String jobName) { this.jobName = jobName; } public String getCompanyName() { return companyName; } public void setCompanyName(String companyName) { this.companyName = companyName; } public String getWorkAddr() { return workAddr; } public void setWorkAddr(String workAddr) { this.workAddr = workAddr; } public String getSalary() { return salary; } public void setSalary(String salary) { this.salary = salary; } public String getPushDate() { return pushDate; } public void setPushDate(String pushDate) { this.pushDate = pushDate; } public String getJobKey() { return jobKey; } public void setJobKey(String jobKey) { this.jobKey = jobKey; }

创建JobParse

public class JobParse { public static List getData(String entity){ /** * 读取mybatis配置文件 */ String resource = "mybatis-config.xml"; InputStream inputStream = null; try { inputStream = Resources.getResourceAsStream(resource); } catch (IOException e) { e.printStackTrace(); } /** * 得到连接对象注册sqlsession */ SqlSessionFactory sqlSessionFactory = new SqlSessionFactoryBuilder().build(inputStream); SqlSession sqlSession = sqlSessionFactory.openSession(); JobsMapper jobsMapper = sqlSession.getMapper(JobsMapper.class); List data = new ArrayList(); Document doc = Jsoup.parse(entity); Elements elements = doc.select("div.el"); Elements title = elements.select("p.t1").select("span").select("a"); //标题 Elements complany = elements.select("span.t2").select("a"); //公司 Elements address = elements.select("span.t3");//地址 Elements salary = elements.select("span.t4");//薪水 Elements datas = elements.select("span.t5");//发布日期 Elements SrcId = elements.select("p.t1").select("input.checkbox");//招聘信息对应的id Jobs jobs = new Jobs(); if (title !=null || title.equals("")) { for (Element element : title) { jobs.setJobName(element.text()); } } if (complany !=null || complany.equals("")) { for (Element element : complany) { jobs.setCompanyName(element.text()); } } if (address !=null || address.equals("")) { for (Element element : address) { jobs.setWorkAddr(element.text()); } } if (salary !=null || salary.equals("")) { for (Element element : salary) { jobs.setSalary(element.text()); } } if (datas !=null || datas.equals("")) { for (Element element : datas) { jobs.setPushDate(element.text()); } } if (SrcId !=null || SrcId.equals("")) { for (Element element : SrcId) { jobs.setJobKey(element.attr("value")); } } jobsMapper.insert(jobs); sqlSession.commit(); data.add(jobs); return data; }

创建爬取主启动程序JobMain

public class JobMain { public static void main(String[] args) { System.out.println("正在生成客户端..."); HttpClient client = null; System.out.println("客户端生成完毕."); String[] city = {"阿坝", "阿克苏", "阿拉尔", "阿拉善盟", "阿勒泰", "阿里", "鞍山", "安康", "安庆", "安顺", "安阳", "巴彦淖尔", "巴音郭楞", "巴中", "白城", "白沙", "白山", "白银", "百色", "蚌埠", "包头", "保定", "保山", "保亭", "宝鸡", "北海", "北京", "本溪", "毕节", "滨州", "博尔塔拉", "亳州", "沧州", "昌都", "昌吉", "昌江", "常德", "常熟", "常州", "长春", "长沙", "长治", "朝阳", "潮州", "郴州", "成都", "澄迈", "承德", "池州", "赤峰", "崇左", "滁州", "楚雄", "重庆", "达州", "大理", "大连", "大庆", "大同", "大兴安岭", "丹东", "丹阳", "德宏", "德阳", "德州", "邓州", "迪庆", "定安", "定西", "东方", "东营", "东莞", "儋州", "鄂尔多斯", "鄂州", "恩施", "防城港", "佛山", "福州", "抚顺", "抚州", "阜新", "阜阳", "甘南", "甘孜", "赣州", "固原", "广安", "广元", "广州", "桂林", "贵港", "贵阳", "果洛", "哈尔滨", "哈密", "海北", "海东", "海口", "海南", "海宁", "海西", "邯郸", "汉中", "杭州", "菏泽", "和田", "合肥", "河池", "河源", "鹤壁", "鹤岗", "贺州", "黑河", "衡水", "衡阳", "红河州", "呼和浩特", "呼伦贝尔", "葫芦岛", "湖州", "怀化", "淮安", "淮北", "淮南", "黄冈", "黄南", "黄山", "黄石", "惠州", "鸡西", "吉安", "吉林", "济南", "济宁", "济源", "嘉兴", "嘉峪关", "佳木斯", "江门", "焦作", "揭阳", "金昌", "金华", "锦州", "晋城", "晋中", "荆门", "荆州", "景德镇", "靖江", "九江", "酒泉", "喀什地区", "开封", "开平", "克拉玛依", "克孜勒苏柯尔克孜", "昆明", "昆山", "拉萨", "莱芜", "来宾", "兰州", "廊坊", "乐山", "丽江", "丽水", "连云港", "凉山", "聊城", "辽阳", "辽源", "林芝", "临沧", "临汾", "临高", "临夏", "临沂", "陵水", "柳州", "六安", "六盘水", "龙岩", "陇南", "娄底", "吕梁", "洛阳", "泸州", "漯河", "马鞍山", "茂名", "梅州", "眉山", "绵阳", "牡丹江", "那曲", "南昌", "南充", "南京", "南宁", "南平", "南通", "南阳", "内江", "宁波", "宁德", "怒江", "攀枝花", "盘锦", "萍乡", "平顶山", "平凉", "莆田", "普洱", "濮阳", "七台河", "齐齐哈尔", "黔东南", "黔南", "黔西南", "潜江", "钦州", "秦皇岛", "青岛", "清远", "庆阳", "琼海", "琼中", "曲靖", "泉州", "衢州", "日喀则", "日照", "三门峡", "三明", "三沙", "三亚", "山南", "汕头", "汕尾", "商洛", "商丘", "上海", "上饶", "韶关", "邵阳", "绍兴", "深圳", "神农架", "沈阳", "十堰", "石河子", "石家庄", "石嘴山", "双鸭山", "朔州", "四平", "松原", "苏州", "宿迁", "宿州", "随州", "绥化", "遂宁", "塔城", "台州", "泰安", "泰兴", "泰州", "太仓", "太原", "唐山", "天津", "天门", "天水", "铁岭", "通化", "通辽", "铜川", "铜陵", "铜仁", "图木舒克", "吐鲁番", "屯昌", "万宁", "威海", "潍坊", "渭南", "温州", "文昌", "文山", "乌海", "乌兰察布", "乌鲁木齐", "无锡", "芜湖", "梧州", "吴忠", "武汉", "武威", "五家渠", "五指山", "西安", "西昌", "西宁", "西双版纳", "锡林郭勒盟", "厦门", "仙桃", "咸宁", "咸阳", "襄阳", "湘潭", "湘西", "孝感", "新乡", "新余", "忻州", "信阳", "兴安盟", "邢台", "雄安新区", "徐州", "许昌", "宣城", "乐东", "雅安", "烟台", "盐城", "延安", "延边", "延吉", "燕郊开发区", "杨凌", "扬州", "洋浦经济开发区", "阳江", "阳泉", "伊春", "伊犁", "宜宾", "宜昌", "宜春", "义乌", "益阳", "银川", "鹰潭", "营口", "永州", "榆林", "玉林", "玉树", "玉溪", "岳阳", "云浮", "运城", "枣庄", "湛江", "漳州", "张家港", "张家界", "张家口", "张掖", "昭通", "肇庆", "镇江", "郑州", "中山", "中卫", "舟山", "周口", "珠海", "株洲", "驻马店", "资阳", "淄博", "自贡", "遵义", "广东省", "江苏省", "浙江省", "四川省", "海南省", "福建省", "山东省", "江西省", "广西", "安徽省", "河北省", "河南省", "湖北省", "湖南省", "陕西省", "山西省", "黑龙江省", "辽宁省", "吉林省", "云南省", "贵州省", "甘肃省", "内蒙古", "宁夏", "西藏", "新疆", "青海省", "香港", "澳门", "台湾", "国外"}; String[] value = { "092200","310600","310900","281500","311300", "300800","230400","201000","150400","260500","170900","280900","311800","092000","241000","101800","240900","270800","141100","150600","280400","160400","251200","101700","200400","140500","010000","231000","260700","121500","311900","151800", "160800","300600","311200","101900","190700","070700","070500","240200","190200","210600","231400","032000","190900","090200","101300","161000","151500","280300","141400","150900","251700","060000", "091700","250500","230300","220500","210400","221400","230800","072100","251600","090600","121300","172000","252000","101100","271100","100900","121000","030800","100800","280800","181000","181800","140800", "030600","110200","230600","131100","231500","150700","271500","092100","130800","290600","091300","091600","030200","140300","141000","260200","320800","220200","310700","320500","320300","100200","320700","081600","320400","160700","200900","080200", "121400","311600","150200","141200","032100","171700","221000","141500","221200","161200","190500","251000","280200","281100","230900","080900","191100","071900","151700","151100","181100","320600","151000","180400","030300", "220900","130900", "240300","120200","120900","171900","080700","270400","220800","031500","170500","032200","270300","080600","230700","210700","211000","180800","180700","130400","072500","130300","270500","310400","170400","032700","310300","311700","250200","070600", "300200","121800","141300","270200","160300","090400","250600","081000","071200","092300","121700","231100","240400","300400","251800","210500","101400","271400","120800","102100","140400","151200","260400","111000","271200","191200","211200", "170300","090500","171500","150500","032300","032600","091200","090300","220700","300700","130200","091100","070200","140200","110800","070900","170600","090900","080300","110900","251900", "091000","231300","130500","171000","271000","110600","251100","171600","221300","220600","260900","261000","260800","181500","140900","160600","120300","031900","271300","100600","101600","250300","110400","081200","300300","121200", "171800","110700","101500","100300","300500","030400","032400","201100","171300","020000","131200","031400","191000","080500","040000","181700","230200","180600","310800","160200","290500","221100","210900","240600","240700","070300","072000","151600","181200","220400","091500","311500", "080800","121100","072300","071800","071600","210200","160500","050000","181600","270600","231200","240500","280700","200500","150800","260600","311100","311400","101200","100700","120600","120500","200700","080400","100500","251400","281000","281200", "310200", "070400", "150300", "140700", "290300", "180200", "270700", "311000", "101000", "200200", "091900", "320200", "251500", "281400", "110300", "181400", "181300", "200300", "180500", "190400", "191500", "180900", "170700", "130600", "211100", "171200", "281300", "161100", "160100", "071100", "171100", "151400","102000","091800","120400","071300","200600","241100","240800","161300","201200","070800","100400","032800","210800","220300","310500","090700","180300","131000","081400","190800","290200","130700","230500","191300","200800","140600","320900","250400","190600","032900","210300","121600","031700","110500","071400","191400","160900","270900","251300","031800","071000","170200","030700", "290400","081100","170800","030500","190300","171400","091400","120700","090800","260300","030000","070000","080000","090000","100000","110000","120000","130000","140000","150000","160000","170000","180000","190000","200000","210000", "220000","230000","240000","250000","260000","270000","280000","290000","300000","310000","320000","330000","340000","350000","360000" }; int pagesize = 1; boolean splider = true; for (int num = 0; num String url = "https://search.51job.com/list/"+ value[num] + ",000000,0000,00,9,99," + city[num] + ",2," + pagesize++ + ".html"; System.out.println(url); List jobsList = null; System.out.println("正在生成客户端..."); client = HttpClientBuilder.create().build(); System.out.println("客户端生成完毕."); //开始解析 try { System.out.println("开始响应客户端..."); try { Thread.sleep(200); jobsList = URLHandle.urlParser(client, url); if (jobsList.iterator().next().getJob_name().equals("")){ pagesize = 1; break; } } catch (InterruptedException e) { e.printStackTrace(); } System.out.println("响应完成."); } catch (ParseException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } System.out.println("开始输出结果..."); for (Jobs job : jobsList) { System.out.println(job.getJob_name()); } System.out.println("整个结果输出完毕,程序结束."); } } }

其中value和city是从51job获取的全部城名和城市代码,这里使用的是while循环,当jobsList中的某一个字段为空的时候说明后面已经没有信息了,里面结束当前循环,并且pagesize初始化为1,下一个城市从第一页开始。就这样反复的轮训city和value数组的城市名和代码来进行爬取,然后通过mybatis插入到数据库。

数据库配置

mapper

public interface JobsMapper { void insert(Jobs jobs); List findAll(); }

jdbc.properties

driver=com.mysql.jdbc.Driver url=jdbc:mysql://localhost:3306/job51?ServerTimezone=GMT%2B8 username=root password=root

mybatis-config.xml

mapper.xml

insert into jobs (job_name,company_name,work_addr,salary,push_date,job_key) values (#{jobName},#{companyName},#{workAddr},#{salary},#{pushDate},#{jobKey} ) SELECT * FROM jobs

在这里插入图片描述 在这里插入图片描述

项目地址

github

总结

爬取51job很简单,这里有个小坑就是该网站的编码是gbk,你没有听错就是gbk,转换成utf-8还不行,还是会乱码,所以这里使用gbk编码。



【本文地址】

公司简介

联系我们

今日新闻

    推荐新闻

    专题文章
      CopyRight 2018-2019 实验室设备网 版权所有