import json
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
# news_url = base_url + snumber
async def extract_news_article(news_url):
    # CSS-based extraction schema: baseSelector scopes the article container,
    # and each field maps a selector inside it to a named output key.
    schema = {
        "name": "AIbase News Article",
        "baseSelector": "div.pb-32",
        "fields": [
            {
                "name": "title",
                "selector": "h1",
                "type": "text",
            },
            {
                "name": "publication_date",
                "selector": "div.flex.flex-col > div.flex.flex-wrap > span:nth-child(6)",
                "type": "text",
            },
            {
                "name": "content",
                "selector": "div.post-content",
                "type": "text",
            },
        ],
    }
    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=news_url,
            extraction_strategy=extraction_strategy,
            bypass_cache=True,  # always fetch a fresh copy of the page
        )
        if not result.success:
            print(f"Crawl failed for {news_url}: {result.error_message}")
            return None
        # extracted_content is a JSON string; JsonCssExtractionStrategy yields
        # one dict per baseSelector match, so this parses to a list of dicts.
        extracted_data = json.loads(result.extracted_content)
        return extracted_data
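

# Usage sketch: run the coroutine from synchronous code and print the result.
# The URL below is a hypothetical example; the `# news_url = base_url + snumber`
# comment above suggests article URLs are built from a base URL plus an
# article number, but the actual base URL is not given here.
if __name__ == "__main__":
    article = asyncio.run(extract_news_article("https://www.aibase.com/news/12345"))
    if article:
        print(json.dumps(article, ensure_ascii=False, indent=2))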