Site-wide Crawling of a Recruitment Website with CrawlSpider

Database Table Schema Design

Reference SQL statement for creating the data table:

CREATE TABLE `article_spider`.`lagou_job`  (
  `title` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
  `url` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
  `salary` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `job_city` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `work_years` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `degree_need` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `job_type` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `publish_time` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `job_advantage` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '',
  `job_desc` longtext CHARACTER SET utf8 COLLATE utf8_general_ci NULL,
  `job_addr` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `company_url` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `company_name` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `url_object_id` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
  `tags` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `crawl_time` datetime(0) NULL DEFAULT NULL,
  PRIMARY KEY (`url_object_id`) USING BTREE
) ENGINE = MyISAM CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
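The primary key url_object_id is the MD5 hash of the job URL; the spider's parse_job callback below builds it with a get_md5 helper. A minimal sketch of such a helper, assuming it lives in utils/common.py (the module path is an assumption, not from the original text):

# utils/common.py -- assumed location of the get_md5 helper used in parse_job
import hashlib

def get_md5(url):
    # hashlib works on bytes, so encode str input first
    if isinstance(url, str):
        url = url.encode("utf-8")
    return hashlib.md5(url).hexdigest()  # 32-character hex digest, fits varchar(50)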

Creating the CrawlSpider and Configuring settings

After activating the virtual environment, generate a new CrawlSpider:

scrapy genspider -t crawl lagou www.lagou.com

If imports from the project fail, add the project path in settings.py:

import os
import sys

BASE_DIR = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
sys.path.insert(0, os.path.join(BASE_DIR, 'ArticleSpider'))
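With the project path on sys.path, the spider can also be started from a small debug script instead of the command line. A minimal sketch, assuming an entry file named main.py in the project root (file name and location are assumptions, not from the original text):

# main.py -- hypothetical debug entry point in the project root
import os
import sys

from scrapy.cmdline import execute

# make the project root importable, then run the lagou spider
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "lagou"])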

Simulated Login and Cookie Passing After Lagou's 302 Redirect


Use selenium to simulate the login, then hand the resulting cookies to Scrapy's Request:

# lagou.py
def start_requests(self):
    BASE_DIR = "C:/Users/yjw55/ArticleSpider/ArticleSpider"
    # Use selenium to simulate the login, then hand the cookies to scrapy's Request.
    # First try to read previously saved cookies from file.
    import os
    import pickle
    import time
    from selenium import webdriver

    cookies = []
    browser = webdriver.Chrome(executable_path="C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")
    browser.get("https://passport.lagou.com/login/login.html")
    # Check whether a cookie file already exists
    if os.path.exists(BASE_DIR + "/cookies/lagou.cookie"):
        cookies = pickle.load(open(BASE_DIR + "/cookies/lagou.cookie", "rb"))
        for cookie in cookies:
            browser.add_cookie(cookie)
        browser.get("https://www.lagou.com/")

    if not cookies:
        browser.get("https://passport.lagou.com/login/login.html")
        # type the account name and password into the login form, then submit
        browser.find_element_by_css_selector(".form_body .input_white").send_keys("1318")
        browser.find_element_by_css_selector('.form_body input[type="password"]').send_keys("me3")
        browser.find_element_by_css_selector('div[data-view="passwordLogin"] input.btn_lg').click()
        time.sleep(8)
        cookies = browser.get_cookies()
        # Save the cookies to file for later runs
        pickle.dump(cookies, open(BASE_DIR + "/cookies/lagou.cookie", "wb"))

    cookie_dict = {}
    for cookie in cookies:
        cookie_dict[cookie["name"]] = cookie["value"]

    # start_requests is overridden, so replicate the default request logic here
    for url in self.start_urls:
        yield scrapy.Request(url, dont_filter=True, cookies=cookie_dict)

Setting global COOKIES options in settings.py

COOKIES_ENABLED = True
COOKIES_DEBUG = True

USER_AGENT = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"

Parsing Job Postings with an ItemLoader

# lagou.py
from datetime import datetime

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ArticleSpider.items import LagouJobItem, LagouJobItemLoader
from ArticleSpider.utils.common import get_md5  # module path of get_md5 is an assumption


class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['http://www.lagou.com/']

    rules = (
        Rule(LinkExtractor(allow=(r"zhaopin/.*",)), follow=True),
        Rule(LinkExtractor(allow=(r"gongsi/j\d+.html",)), follow=True),
        Rule(LinkExtractor(allow=r'jobs/\d+.html'), callback='parse_job', follow=True),
    )

    def parse_job(self, response):
        # collect every field of a job posting through the ItemLoader
        item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
        item_loader.add_css("title", ".job-name::attr(title)")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".job_request .salary::text")
        item_loader.add_xpath("job_city", "//*[@class='job_request']//span[2]/text()")
        item_loader.add_xpath("work_years", "//*[@class='job_request']//span[3]/text()")
        item_loader.add_xpath("degree_need", "//*[@class='job_request']//span[4]/text()")
        item_loader.add_xpath("job_type", "//*[@class='job_request']//span[5]/text()")

        item_loader.add_css("tags", '.position-label li::text')
        item_loader.add_css("publish_time", ".publish_time::text")
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("job_addr", ".work_addr")
        item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_value("crawl_time", datetime.now())

        job_item = item_loader.load_item()
        return job_item

Saving Job Data to the Database

# items.py
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from w3lib.html import remove_tags

from ArticleSpider.settings import SQL_DATETIME_FORMAT  # assumed location of the constant, see note below


def remove_splash(value):
    # strip the slash from fields such as the job city
    return value.replace("/", "")


def handle_jobaddr(value):
    # drop the "查看地图" (view map) link text and join the address fragments
    addr_list = value.split("\n")
    addr_list = [item.strip() for item in addr_list if item.strip() != "查看地图"]
    return "".join(addr_list)


class LagouJobItemLoader(ItemLoader):
    # custom ItemLoader: keep only the first extracted value by default
    default_output_processor = TakeFirst()


class LagouJobItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    salary = scrapy.Field()
    job_city = scrapy.Field(
        # processing applied to the value before it is stored
        input_processor=MapCompose(remove_splash),
    )
    work_years = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    degree_need = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    job_type = scrapy.Field()
    publish_time = scrapy.Field()
    job_advantage = scrapy.Field()
    job_desc = scrapy.Field()
    job_addr = scrapy.Field(
        input_processor=MapCompose(remove_tags, handle_jobaddr),
    )
    company_name = scrapy.Field()
    company_url = scrapy.Field()
    tags = scrapy.Field(
        input_processor=Join(",")
    )
    crawl_time = scrapy.Field()

    def get_insert_sql(self):
        insert_sql = """
            insert into lagou_job(title, url, url_object_id, salary, job_city, work_years, degree_need,
            job_type, publish_time, job_advantage, job_desc, job_addr, company_name, company_url,
            tags, crawl_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE salary=VALUES(salary), job_desc=VALUES(job_desc)
        """
        params = (
            self.get("title", ""),
            self.get("url", ""),
            self.get("url_object_id", ""),
            self.get("salary", ""),
            self.get("job_city", ""),
            self.get("work_years", ""),
            self.get("degree_need", ""),
            self.get("job_type", ""),
            self.get("publish_time", "0000-00-00"),
            self.get("job_advantage", ""),
            self.get("job_desc", ""),
            self.get("job_addr", ""),
            self.get("company_name", ""),
            self.get("company_url", ""),
            self.get("tags", ""),
            self["crawl_time"].strftime(SQL_DATETIME_FORMAT),
        )

        return insert_sql, params
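get_insert_sql formats crawl_time with SQL_DATETIME_FORMAT, so that constant must be importable from items.py. A minimal sketch, assuming it is defined in settings.py (the location is an assumption):

# settings.py (assumed location) -- format string matching MySQL's DATETIME column
SQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"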

Configuring settings.py

ITEM_PIPELINES = {
    'ArticleSpider.pipelines.MysqlTwistedPipline': 4,
}
# pipelines.py
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi


class MysqlTwistedPipline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)

        return cls(dbpool)

    def process_item(self, item, spider):
        # use twisted to run the MySQL insert asynchronously
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle exceptions
        return item

    def handle_error(self, failure, item, spider):
        # handle exceptions raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # run the actual insert; each item type builds its own SQL statement
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)
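from_settings reads the connection parameters from settings.py, so those keys must be defined there. A hedged example (host and credentials are placeholders; the database name article_spider matches the CREATE TABLE statement above):

# settings.py -- connection parameters read by MysqlTwistedPipline.from_settings
MYSQL_HOST = "127.0.0.1"         # placeholder host
MYSQL_DBNAME = "article_spider"  # database holding the lagou_job table
MYSQL_USER = "root"              # placeholder user
MYSQL_PASSWORD = ""              # placeholder password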

Bypassing the Site's Anti-Crawling Measures

RANDOMIZE_DOWNLOAD_DELAY (default: True)

If enabled, Scrapy waits a random amount of time (a random value between 0.5 and 1.5 multiplied by DOWNLOAD_DELAY) between requests to the same website.

This randomization lowers the chance of the crawler being detected (and subsequently blocked), since some sites analyze requests and look for suspicious regularity in the timing between them.

#settings.py
DOWNLOAD_DELAY = 3
RANDOMIZE_DOWNLOAD_DELAY = True
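To make the effect of these two settings concrete, the wait applied between requests to the same site can be approximated like this (a sketch of the calculation, not Scrapy's internal code):

# sketch: the effective delay falls in [0.5 * DOWNLOAD_DELAY, 1.5 * DOWNLOAD_DELAY]
import random

DOWNLOAD_DELAY = 3
delay = random.uniform(0.5, 1.5) * DOWNLOAD_DELAY  # e.g. somewhere between 1.5 s and 4.5 s
print(delay)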