Spaces:
Sleeping
Sleeping
File size: 1,342 Bytes
1c0805d bea80f3 979ab71 1c0805d 979ab71 bea80f3 979ab71 bea80f3 979ab71 bea80f3 1c0805d 979ab71 1c0805d bea80f3 1c0805d bea80f3 1c0805d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import requests
from bs4 import BeautifulSoup
import html2text
from newspaper import Article
# ASGI application object; the route decorators below register endpoints on it.
app = FastAPI()
class URLRequest(BaseModel):
    """JSON request body accepted by the ``/extract`` endpoint."""

    # Address of the web page whose content should be extracted.
    url: str
# Seconds to wait on the upstream site before giving up; without a timeout
# a stalled server would block the worker indefinitely.
_FETCH_TIMEOUT_SECONDS = 15

# Below this many words the BeautifulSoup result is treated as a failed
# extraction and the newspaper fallback is tried.
_MIN_WORD_COUNT = 100


@app.post("/extract")
def extract_article(request: URLRequest):
    """Fetch a web page and return its title and main content as Markdown.

    The page is first downloaded with ``requests`` and parsed with
    BeautifulSoup; the ``<article>``, ``<main>`` or ``<body>`` element
    (first one found, in that order) is converted to Markdown with
    ``html2text``. If that yields fewer than ``_MIN_WORD_COUNT`` words, the
    ``newspaper`` extractor is tried as a fallback.

    Args:
        request: Request body carrying the ``url`` of the page to extract.

    Returns:
        A dict with two keys: ``"title"`` (the page title, or ``"Untitled"``
        when none could be determined) and ``"markdown"`` (the extracted
        content).

    Raises:
        HTTPException: With status 500 and the underlying error message as
            detail when fetching or parsing fails for any reason.
    """
    try:
        # 1. First attempt: requests + BeautifulSoup.
        # NOTE(security): request.url is caller-supplied and fetched as-is;
        # restrict reachable hosts if this endpoint is exposed to untrusted
        # clients (server-side request forgery risk).
        response = requests.get(
            request.url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=_FETCH_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Pick the main content container (best effort).
        article_tag = soup.find('article') or soup.find('main') or soup.body
        # soup.body is None for non-HTML responses; str(None) would leak the
        # literal text "None" into the output.
        html_content = str(article_tag) if article_tag is not None else ""
        markdown = html2text.html2text(html_content).strip()

        # Tag.string is None when <title> is empty or has nested children,
        # so read the text content and fall back explicitly.
        title = "Untitled"
        if soup.title is not None:
            title = soup.title.get_text(strip=True) or title

        # 2. If the Markdown is empty or too short, fall back to newspaper.
        if len(markdown.split()) < _MIN_WORD_COUNT:
            article = Article(request.url)
            article.download()
            article.parse()
            title = article.title or title
            # article.text is already plain text: running it through
            # html2text would parse it as HTML and collapse its paragraph
            # breaks, so it is used directly.
            fallback_text = (article.text or "").strip()
            # Keep the short primary result when the fallback found nothing.
            if fallback_text:
                markdown = fallback_text

        return {
            "title": title,
            "markdown": markdown
        }
    except Exception as e:
        # Endpoint boundary: surface any failure as an HTTP 500 while keeping
        # the original exception chained for server-side tracebacks.
        raise HTTPException(status_code=500, detail=str(e)) from e
|