1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
| from w3lib.html import remove_tags
def remove_splash(value):
    """Return *value* with every "/" character removed."""
    # Splitting on "/" and re-joining the pieces drops each slash.
    segments = value.split("/")
    return "".join(segments)
def handle_jobaddr(value):
    """Collapse a multi-line address string into a single line.

    The text is split on newlines, each line is stripped of surrounding
    whitespace, any line that is exactly "查看地图" after stripping is
    dropped, and the remaining pieces are concatenated with no separator.
    """
    pieces = []
    for raw_line in value.split("\n"):
        text = raw_line.strip()
        if text == "查看地图":
            # Skip this marker line; it is not part of the address.
            continue
        pieces.append(text)
    return "".join(pieces)
class LagouJobItemLoader(ItemLoader):
    """Item loader whose default output processor is ``TakeFirst()``."""

    # Used for every field that does not declare its own output processor.
    default_output_processor = TakeFirst()
class LagouJobItem(scrapy.Item):
    """Item describing one job posting, with a helper to build its insert SQL.

    Fields without an ``input_processor`` are stored as provided by the
    loader. ``job_city``, ``work_years`` and ``degree_need`` have "/"
    characters removed; ``job_addr`` has tags removed and is flattened to a
    single line; ``tags`` values are joined with ",".
    """

    title = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    salary = scrapy.Field()
    job_city = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    work_years = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    degree_need = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    job_type = scrapy.Field()
    publish_time = scrapy.Field()
    job_advantage = scrapy.Field()
    job_desc = scrapy.Field()
    job_addr = scrapy.Field(
        input_processor=MapCompose(remove_tags, handle_jobaddr),
    )
    company_name = scrapy.Field()
    company_url = scrapy.Field()
    tags = scrapy.Field(
        input_processor=Join(","),
    )
    crawl_time = scrapy.Field()

    def get_insert_sql(self):
        """Build the upsert statement for the ``lagou_job`` table.

        Returns:
            A ``(insert_sql, params)`` tuple: the SQL text with ``%s``
            placeholders and the matching parameter tuple, in column order.

        Missing fields fall back to an empty string (``publish_time`` falls
        back to ``"0000-00-00"``). ``crawl_time`` is read by subscript and
        formatted with ``strftime``, so it must be set to a datetime-like
        value before this method is called.
        """
        insert_sql = """
            insert into lagou_job(title, url, url_object_id, salary, job_city, work_years, degree_need,
            job_type, publish_time, job_advantage, job_desc, job_addr, company_name, company_url,
            tags, crawl_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE salary=VALUES(salary), job_desc=VALUES(job_desc)
        """
        params = (
            self.get("title", ""),
            self.get("url", ""),
            self.get("url_object_id", ""),
            self.get("salary", ""),
            self.get("job_city", ""),
            self.get("work_years", ""),
            self.get("degree_need", ""),
            self.get("job_type", ""),
            self.get("publish_time", "0000-00-00"),
            self.get("job_advantage", ""),
            self.get("job_desc", ""),
            self.get("job_addr", ""),
            self.get("company_name", ""),
            self.get("company_url", ""),
            # Bind the tags column to the tags field; previously job_addr
            # was passed here a second time, so tags were never stored.
            self.get("tags", ""),
            self["crawl_time"].strftime(SQL_DATETIME_FORMAT),
        )

        return insert_sql, params
|