

```bash
# Basic installation
pip install scrapling

# Install anti-bot / browser dependencies
pip install "scrapling[fetchers]"
scrapling install

# Full installation (includes AI / shell extras)
pip install "scrapling[all]"
```
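A quick way to confirm the install worked is importing the package; a minimal sketch, assuming scrapling follows the common convention of exposing a `__version__` attribute:

```python
# Sanity check after installation; __version__ is assumed here
# (a common packaging convention, not confirmed by this article)
import scrapling

print(scrapling.__version__)
```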
```bash
docker pull pyd4vinci/scrapling
docker pull ghcr.io/d4vinci/scrapling:latest
```

```python
from scrapling.fetchers import Fetcher

# Make the request (if this site fails to parse, swap in any other reachable URL)
page = Fetcher.get("https://quotes.toscrape.com/")

# Extract data
quotes = page.css(".quote .text::text").getall()
authors = page.css(".quote .author::text").getall()
print(list(zip(quotes, authors)))
```
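The same extraction can be written with XPath, which Scrapling supports alongside CSS selectors; a minimal sketch, with the element classes (`quote`, `text`, `author`) taken from the demo site's markup:

```python
from scrapling.fetchers import Fetcher

page = Fetcher.get("https://quotes.toscrape.com/")

# XPath equivalents of the CSS selectors above
quotes = page.xpath('//div[@class="quote"]/span[@class="text"]/text()').getall()
authors = page.xpath('//div[@class="quote"]//small[@class="author"]/text()').getall()
print(list(zip(quotes, authors)))
```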
```python
from scrapling.fetchers import StealthyFetcher

# Automatically bypasses common anti-bot checks (this demo page is normally
# reachable; if parsing fails, retry later)
page = StealthyFetcher.fetch("https://nopecha.com/demo/cloudflare")
data = page.css("#padded_content a").getall()
print(data)
```
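For pages behind an actual Cloudflare challenge, the CLI section below exposes a --solve-cloudflare flag; here is a sketch of the programmatic counterpart, where the `headless` and `solve_cloudflare` keyword names are assumptions mirroring that flag:

```python
from scrapling.fetchers import StealthyFetcher

# Sketch: solve_cloudflare / headless are assumed keyword options,
# mirroring the CLI's --solve-cloudflare flag shown later in this article
page = StealthyFetcher.fetch(
    "https://nopecha.com/demo/cloudflare",
    headless=True,
    solve_cloudflare=True,
)
print(page.status)  # HTTP status of the final response
```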
```python
from scrapling.spiders import Spider, Response

class QuotesSpider(Spider):
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]  # swap in another URL if this one fails to parse

    async def parse(self, response: Response):
        for quote in response.css(".quote"):
            yield {
                "text": quote.css(".text::text").get(),
                "author": quote.css(".author::text").get(),
            }
        # Automatic pagination
        next_page = response.css(".next a::attr(href)").get()
        if next_page:
            yield response.follow(next_page)

# Run the spider
result = QuotesSpider().start()
result.items.to_json("quotes.json")
```
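The run above writes the scraped items to quotes.json; reading them back is plain Python, assuming to_json emits a JSON array of the dicts yielded by parse():

```python
import json

# Assumes to_json wrote a JSON array of the dicts yielded by parse()
with open("quotes.json", encoding="utf-8") as f:
    items = json.load(f)

print(len(items), items[0]["author"])
```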
```python
from scrapling.spiders import Spider, Request, Response
from scrapling.fetchers import FetcherSession, AsyncStealthySession

class MultiSessionSpider(Spider):
    name = "multi"
    start_urls = ["https://example.com/"]  # example.com won't yield real data; replace with a reachable URL

    def configure_sessions(self, manager):
        # Register two sessions: fast HTTP requests ("fast") and stealth bypass ("stealth")
        manager.add("fast", FetcherSession(impersonate="chrome"))
        manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)

    async def parse(self, response: Response):
        for link in response.css("a::attr(href)").getall():
            # Route protected pages to the stealth session; use the fast session elsewhere
            if "protected" in link:
                yield Request(link, sid="stealth")
            else:
                yield Request(link, sid="fast", callback=self.parse)
```
```python
import asyncio

from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession

async def main():
    # FetcherSession works in both sync and async modes
    async with FetcherSession(http3=True) as session:
        page1 = await session.get("https://example.com/page1")  # replace with reachable URLs
        page2 = await session.get("https://example.com/page2", impersonate="firefox135")

    # Concurrent requests through an async session
    async with AsyncStealthySession(max_pages=2) as session:
        urls = ["https://example.com/page1", "https://example.com/page2"]  # replace with reachable URLs
        tasks = [session.fetch(url) for url in urls]

        # Inspect the browser tab pool (optional)
        print(session.get_pool_stats())
        results = await asyncio.gather(*tasks)
        print(session.get_pool_stats())

asyncio.run(main())
```
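Since the comment above notes that FetcherSession also works synchronously, the sync form simply drops the async/await keywords; a minimal sketch under that assumption:

```python
from scrapling.fetchers import FetcherSession

# Synchronous counterpart of the async example above; assumes the same
# method names apply in sync mode, per the sync/async note in the source
with FetcherSession(impersonate="chrome") as session:
    page1 = session.get("https://example.com/page1")  # replace with reachable URLs
    page2 = session.get("https://example.com/page2")
    print(page1.status, page2.status)
```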
```bash
# 1. Launch the interactive web-scraping shell (IPython-based, with shortcut helpers)
scrapling shell

# 2. Basic extraction: fetch a page and export it in a chosen format (txt/md/html)
# Example 1: extract the full page content to a Markdown file
scrapling extract get 'https://example.com' content.md

# Example 2: extract a target element via CSS selector, impersonating Chrome
scrapling extract get 'https://example.com' content.txt --css-selector '#target-element' --impersonate 'chrome'

# Example 3: stealth mode, automatically solving Cloudflare-style challenges
scrapling extract stealthy-fetch 'https://example.com/protected-page' result.html --solve-cloudflare

# 3. Dynamic page extraction (for JS-rendered content)
scrapling extract dynamic-fetch 'https://example.com/dynamic-page' dynamic-result.txt --wait 3  # wait 3 seconds for dynamic content

# 4. Show the CLI help for all commands and options
scrapling --help
scrapling extract --help  # details for the extract subcommands
```
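The dynamic-fetch subcommand also has a programmatic counterpart; a sketch assuming a DynamicFetcher class alongside Fetcher/StealthyFetcher, where the class name and the network_idle option are assumptions standing in for the CLI's --wait:

```python
from scrapling.fetchers import DynamicFetcher

# Sketch: DynamicFetcher (browser-based fetching) and network_idle are
# assumptions; they stand in for the CLI's dynamic-fetch --wait behavior
page = DynamicFetcher.fetch("https://example.com/dynamic-page", network_idle=True)
print(page.css("title::text").get())
```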
Project repository: https://github.com/D4Vinci/Scrapling