
Web Scraping Walkthrough: A Company Information Crawler

灰熊 Python分析与挖掘 2024-04-01

I have shared quite a bit of machine-learning content before, so this time let's do a development-oriented post. This one walks through a crawler for Hong Kong company information. Since it's a crawler write-up, we'll briefly run through the overall workflow and then go straight to the code.

Overall project workflow

  • Get familiar with the page structure and identify the fields we need to collect

  • Fetch and parse the pages with the Scrapy framework (a sketch of the assumed project layout follows this list)

  • Store the structured data in a MySQL database

  • Show the results
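
Before diving in, here is a minimal sketch of how such a Scrapy project is typically laid out. The file and directory names are assumptions inferred from the imports in the code later in this post (company_crawl.items, the spider's relative import of util), not something shown in the original project:

company_crawl/
├── scrapy.cfg
└── company_crawl/
    ├── items.py           # CompanyCrawlItem definition
    ├── pipelines.py       # CompanyCrawlPipeline (MySQL storage)
    ├── settings.py        # ITEM_PIPELINES, download delay, etc.
    └── spiders/
        ├── hk_company.py  # HkCompanySpider
        └── util.py        # get_level_3_content (detail-page parsing)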


Part 1: Getting familiar with the page structure and the fields to collect

1. On the target site we can find the industry index; this is the first-level page we crawl.

2. Iterating over each industry takes us to the second-level listing pages.

3. Iterating over each company on a listing page takes us to the third-level detail pages.

That is the traversal we need to perform; once the pages are fetched, we parse the relevant fields to get the content we want.
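
To make the three levels concrete, the URL pattern, as inferred from the spider code in the next section, looks roughly like this (the placeholder paths are illustrative assumptions, not taken from the site):

# Level 1: industry index (the spider's start URL)
#   https://hongkong.mingluji.com/Industry_Index
# Level 2: one listing page of an industry, paginated as <industry_url>/<page>
#   https://hongkong.mingluji.com/<industry_path>/0
# Level 3: one company's detail page, linked from the listing
#   https://hongkong.mingluji.com/<company_path>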

Part 2: Fetching and parsing the pages with Scrapy (the spider file)

import scrapy
from lxml import etree

from company_crawl.items import CompanyCrawlItem
from .util import *


class HkCompanySpider(scrapy.Spider):
    name = 'hk_company'
    start_urls = ['https://hongkong.mingluji.com/Industry_Index']
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def start_requests(self):
        # FormRequest without formdata behaves like a plain GET request
        yield scrapy.FormRequest(url=self.start_urls[0], headers=self.headers,
                                 callback=self.parse_url_one)

    def parse_url_one(self, response):
        """Level 1: collect every industry's name, URL, company count and page count."""
        html = etree.HTML(response.text)
        all_industry_name_list = html.xpath('//*[@id="mw-content-text"]/table/tr/td[2]/a/text()')
        all_industry_url_list = html.xpath('//*[@id="mw-content-text"]/table/tr/td[2]/a/@href')
        all_industry_url_list_new = list(map(lambda x: "https://hongkong.mingluji.com" + str(x),
                                             all_industry_url_list))
        all_industry_company_num = html.xpath('//*[@id="mw-content-text"]/table/tr/td[3]/text()')
        all_industry_company_num = list(map(lambda x: int(x.split(' ')[0].replace(',', '')),
                                            all_industry_company_num))
        all_industry_company_page_num = html.xpath('//*[@id="mw-content-text"]/table/tr/td[4]/text()')
        all_industry_company_page_num = list(map(lambda x: int(x.split(' ')[0].replace(',', '')),
                                                 all_industry_company_page_num))
        all_content_level_1 = list(zip(all_industry_name_list, all_industry_url_list_new,
                                       all_industry_company_num, all_industry_company_page_num))
        for content in all_content_level_1[:]:
            industry_name = content[0]
            industry_url = content[1]
            company_nums = content[2]
            page_num = content[-1]
            # one request per listing page of this industry
            for page in range(page_num):
                yield scrapy.FormRequest(industry_url + '/' + str(page), headers=self.headers,
                                         callback=self.parse_url_two,
                                         meta={'industry_name': industry_name,
                                               'company_nums': company_nums,
                                               'industry_pages': page_num})

    def parse_url_two(self, response):
        """Level 2: collect every company name and URL on an industry listing page."""
        industry_name = response.meta['industry_name']
        company_nums = response.meta['company_nums']
        industry_pages = response.meta['industry_pages']
        html_1 = etree.HTML(response.text)
        all_company_url_list = html_1.xpath('//*[@id="mw-content-text"]/table/tr/td/ol/li/a/@href')
        all_company_url_list_new = list(map(lambda x: "https://hongkong.mingluji.com" + str(x),
                                            all_company_url_list))
        all_company_name_list = html_1.xpath('//*[@id="mw-content-text"]/table/tr/td/ol/li/a/text()')
        all_company_content_list = list(zip(all_company_url_list_new, all_company_name_list))
        for company_content in all_company_content_list:
            company_name = company_content[1]
            company_url = company_content[0]
            yield scrapy.FormRequest(url=company_url, headers=self.headers,
                                     callback=self.parse_url_three,
                                     meta={'industry_name': industry_name,
                                           'company_nums': company_nums,
                                           'industry_pages': industry_pages,
                                           'company_url': company_url})

    def parse_url_three(self, response):
        """Level 3: parse a company detail page and yield the item."""
        res_content = get_level_3_content(response.text)
        item = CompanyCrawlItem()
        item['company_name'] = res_content[0]
        item['company_url'] = response.meta['company_url']
        item['contact_person'] = res_content[1]
        item['contact_person_job'] = res_content[2]
        item['company_address'] = res_content[3]
        item['company_telephone'] = res_content[4]
        item['company_fax_number'] = res_content[5]
        item['company_email'] = res_content[6]
        item['company_country'] = res_content[7]  # note: index 7 is the website/location field returned by util
        item['company_postal_code'] = res_content[-1]
        item['industry_pages'] = response.meta['industry_pages']
        item['industry_name'] = response.meta['industry_name']
        yield item
The helper get_level_3_content lives in util.py next to the spider (hence the relative import above); it parses the third-level detail pages with regular expressions:

# -*- coding: utf-8 -*-
# @Organization :
# @Author       : hhx
# @Time         : 2023/1/27 11:49 AM
# @Email        : phhx223@163.com
import re

# Note: in a regex, `.` does not match a newline, so the patterns below use
# [\s\S] to match any character, including newlines.
# This module contains the parsing code for the third-level (company detail) pages.


def get_level_3_content(url_text):
    url_text = str(url_text)

    def exit_or_not(list_):
        # strip residual HTML tags and newlines from the first match, or return None
        pattern_str_sub = re.compile('<.*?>')
        if list_:
            return re.sub(pattern_str_sub, '', list_[0]).replace('\n', '')
        else:
            return None

    # the labels (公司名稱, 聯繫人員, ...) are the Traditional-Chinese field names on the page
    pattern_company_name = re.compile(r'公司名稱[\s\S]*?"name">(.*?)<')
    pattern_contact_person = re.compile(r'聯繫人員[\s\S]*?"name">(.*?)<')
    pattern_job_title = re.compile(r'工作職務[\s\S]*?"jobTitle">(.*?)<')
    pattern_Address = re.compile(r'辦公地址[\s\S]*?"address">(.[\s\S]*?)</dd>')
    pattern_telephone = re.compile(r'電話號碼[\s\S]*?"telephone">(.[\s\S]*?)<')
    pattern_fax_number = re.compile(r'傳真號碼[\s\S]*?"faxNumber">(.[\s\S]*?)<')
    pattern_email = re.compile(r'電子郵箱[\s\S]*?"email">(.[\s\S]*?)<')
    pattern_Website_url = re.compile(r'網站網址[\s\S]*?"location">(.[\s\S]*?)<')
    pattern_postal_code = re.compile(r'郵政編碼[\s\S]*?>(.[\s\S]*?)</dd>')

    company_name = exit_or_not(re.findall(pattern_company_name, url_text))
    contact_person = exit_or_not(re.findall(pattern_contact_person, url_text))
    job_title = exit_or_not(re.findall(pattern_job_title, url_text))
    Address = exit_or_not(re.findall(pattern_Address, url_text))
    telephone = exit_or_not(re.findall(pattern_telephone, url_text))
    fax_number = exit_or_not(re.findall(pattern_fax_number, url_text))
    email = exit_or_not(re.findall(pattern_email, url_text))
    Website_url = exit_or_not(re.findall(pattern_Website_url, url_text))
    postal_code = exit_or_not(re.findall(pattern_postal_code, url_text))

    return [company_name, contact_person, job_title, Address, telephone,
            fax_number, email, Website_url, postal_code]
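
The spider also imports CompanyCrawlItem from company_crawl.items. That file isn't shown in the post, but based on the fields the spider fills in and the pipeline reads, a minimal sketch of items.py would look something like this (the field set is inferred, not copied from the original project):

import scrapy


class CompanyCrawlItem(scrapy.Item):
    # fields inferred from what parse_url_three assigns and what the pipeline stores
    company_name = scrapy.Field()
    company_url = scrapy.Field()
    contact_person = scrapy.Field()
    contact_person_job = scrapy.Field()
    company_address = scrapy.Field()
    company_telephone = scrapy.Field()
    company_fax_number = scrapy.Field()
    company_email = scrapy.Field()
    company_country = scrapy.Field()
    company_postal_code = scrapy.Field()
    industry_pages = scrapy.Field()
    industry_name = scrapy.Field()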

Part 3: Storing the structured data in MySQL (the pipeline file)

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

import datetime
import pymysql


class CompanyCrawlPipeline:

    def __init__(self):
        self.now = str(datetime.date.today())
        self.connect = pymysql.connect(host='localhost', user='root', password='*******',
                                       db='Crawl_info', port=3306)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        company_name = item['company_name']
        company_url = item['company_url']
        contact_person = item['contact_person']
        contact_person_job = item['contact_person_job']
        company_address = item['company_address']
        company_telephone = item['company_telephone']
        company_fax_number = item['company_fax_number']
        company_email = item['company_email']
        company_country = item['company_country']
        company_postal_code = item['company_postal_code']
        industry_pages = item['industry_pages']
        industry_name = item['industry_name']
        # INSERT IGNORE skips rows that violate the table's unique key; for production use,
        # parameterised queries (cursor.execute(sql, params)) are safer than str.format
        self.cursor.execute('insert ignore into hk_company(company_name, company_url, \
            contact_person, contact_person_job, company_address, company_telephone, company_fax_number, company_email, \
            company_country, company_postal_code, industry_pages, industry_name, crawl_time) values \
            ("{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}")'.format(
            company_name, company_url, contact_person, contact_person_job, company_address,
            company_telephone, company_fax_number, company_email, company_country,
            company_postal_code, industry_pages, industry_name, self.now))
        self.connect.commit()
        return item

    def spider_closed(self):
        self.cursor.close()
        self.connect.close()
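
Two things the post leaves out are the hk_company table definition and registering the pipeline. Below is a rough sketch of the table; the column types and the unique key are my assumptions (INSERT IGNORE only deduplicates when some unique key exists):

-- assumed schema for the target table; column types and the unique key are guesses
CREATE TABLE IF NOT EXISTS hk_company (
    id INT AUTO_INCREMENT PRIMARY KEY,
    company_name VARCHAR(255),
    company_url VARCHAR(512),
    contact_person VARCHAR(255),
    contact_person_job VARCHAR(255),
    company_address VARCHAR(512),
    company_telephone VARCHAR(64),
    company_fax_number VARCHAR(64),
    company_email VARCHAR(255),
    company_country VARCHAR(255),
    company_postal_code VARCHAR(64),
    industry_pages INT,
    industry_name VARCHAR(255),
    crawl_time DATE,
    UNIQUE KEY uk_company_url (company_url)
) DEFAULT CHARSET = utf8mb4;

The pipeline also only runs if it is enabled in settings.py; the dotted path below assumes the project layout sketched at the start of this post:

# company_crawl/settings.py
ITEM_PIPELINES = {
    'company_crawl.pipelines.CompanyCrawlPipeline': 300,
}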

Part 4: Results
Run the code above and open the corresponding database to see the company data, as shown in the screenshot.
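
For completeness, the crawl is started with the spider name defined above, assuming a standard Scrapy project:

scrapy crawl hk_company    # run from the directory containing scrapy.cfg

Afterwards, a quick query in MySQL such as SELECT COUNT(*) FROM hk_company; (against the assumed table above) confirms that rows were inserted.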

Friendly reminder: do not use this for commercial purposes; it is for personal study only! If anything in the code is unclear, feel free to leave a comment!

    
