1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import csv
import os
import re

from bs4 import BeautifulSoup
# Directory that holds the saved HTML pages to parse.
folder_path = './saved_pages'

# Path of the CSV file the script writes.
output_csv = 'products.csv'

# CSV header row; each product row is built in this same column order.
fields = [
    '商品标题',
    '封面链接',
    '价格',
    '付款人数',
    '省份',
    '城市',
    '支持的服务',
    '旗舰店昵称',
    '旗舰店tag',
]

# Collects one row (a list of column values) per parsed product.
all_products = []
def parse_sales(sales_text):
    """Return the raw sales-count text with surrounding whitespace removed.

    The string is kept as-is (no numeric parsing). Falsy input such as
    ``None`` or ``''`` yields an empty string.
    """
    return sales_text.strip() if sales_text else ''
# Class-name patterns used to locate each field. They are compiled once here
# rather than on every loop iteration. Each pattern matches only the leading
# part of a class name; presumably the text after "--" is a generated suffix
# (TODO confirm against the saved pages).
_TITLE_CLASS = re.compile(r'title--')
_PRICE_WRAPPER_CLASS = re.compile(r'priceWrapper--')
_PRICE_INT_CLASS = re.compile(r'priceInt--')
_PRICE_FLOAT_CLASS = re.compile(r'priceFloat--')
_SALES_CLASS = re.compile(r'realSales--')
_AREA_CLASS = re.compile(r'procity--')
_SERVICES_CLASS = re.compile(r'subIconWrapper--')
_SHOP_CLASS = re.compile(r'shopName--')
_SHOP_NAME_CLASS = re.compile(r'shopNameText--')
_SHOP_TAG_CLASS = re.compile(r'shopTagText--')


def _text_of(tag):
    """Return the stripped text of *tag*, or '' when *tag* is None."""
    return tag.get_text(strip=True) if tag is not None else ''


def _extract_product(item):
    """Build one CSV row from a title element.

    *item* is a ``div`` whose class matches ``title--``. The remaining fields
    are looked up relative to it. Any field that cannot be found is returned
    as an empty string.

    Returns a list: title, cover URL, price, sales text, province, city,
    services, shop name, shop tag.

    NOTE(review): ``find_previous`` / ``find_next`` search the whole document
    in document order, not just the current product card. If a card lacks an
    image, price block, service block or shop link, the value of a
    neighbouring card is picked up instead. Scoping the lookups to the card
    container would fix this, but that container is not identifiable from
    this script alone.
    """
    # Product title: prefer the ``title`` attribute, fall back to the text.
    title = item.get('title') or item.get_text(strip=True)

    # Cover image: nearest preceding <img>. ``.get`` avoids a KeyError when
    # the tag has no ``src`` attribute.
    img_tag = item.find_previous('img')
    img_url = img_tag.get('src', '') if img_tag is not None else ''

    # Price, sales count and location all live inside the price wrapper.
    price = sales = province = city = ''
    price_parent = item.find_next('div', class_=_PRICE_WRAPPER_CLASS)
    if price_parent is not None:
        # The price is split into an integer part and a fractional part.
        price = (
            _text_of(price_parent.find('div', class_=_PRICE_INT_CLASS))
            + _text_of(price_parent.find('div', class_=_PRICE_FLOAT_CLASS))
        )
        sales = parse_sales(
            _text_of(price_parent.find('span', class_=_SALES_CLASS))
        )
        # First location element is the province, the second the city.
        area_tags = price_parent.find_all('div', class_=_AREA_CLASS)
        if len(area_tags) > 0:
            province = _text_of(area_tags[0])
        if len(area_tags) > 1:
            city = _text_of(area_tags[1])

    # Supported services: every <span> inside the service-icon wrapper,
    # comma-joined (an empty wrapper simply joins to '').
    services_parent = item.find_next('div', class_=_SERVICES_CLASS)
    services = ''
    if services_parent is not None:
        services = ','.join(
            span.get_text(strip=True)
            for span in services_parent.find_all('span')
        )

    # Shop information: name and tag are <span>s inside the shop link.
    shop_name = shop_tag = ''
    shop_parent = item.find_next('a', class_=_SHOP_CLASS)
    if shop_parent is not None:
        shop_name = _text_of(shop_parent.find('span', class_=_SHOP_NAME_CLASS))
        shop_tag = _text_of(shop_parent.find('span', class_=_SHOP_TAG_CLASS))

    return [
        title, img_url, price, sales, province, city,
        services, shop_name, shop_tag,
    ]


# Walk every HTML file in the folder. ``os.listdir`` returns entries in
# arbitrary order, so the names are sorted to keep the CSV row order stable
# between runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.html'):
        continue
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')
    # Every element whose class matches ``title--`` is treated as one product.
    for item in soup.find_all('div', class_=_TITLE_CLASS):
        all_products.append(_extract_product(item))

# Save to CSV. ``utf-8-sig`` writes a BOM at the start of the file.
with open(output_csv, 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(fields)
    writer.writerows(all_products)

print(f"解析完成,已保存 {len(all_products)} 条商品到 {output_csv}")