# KJ24 - "Update app.py" (commit 979ab71, verified)
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import requests
from bs4 import BeautifulSoup
import html2text
from newspaper import Article
# FastAPI application instance; the /extract route below is registered on it.
app = FastAPI()
class URLRequest(BaseModel):
    """Request body accepted by the ``/extract`` endpoint."""

    # Address of the page whose content should be fetched and converted.
    url: str
# Upper bound (seconds) on the upstream fetch so a stalled server cannot
# block a worker indefinitely.
_FETCH_TIMEOUT_SECONDS = 10

# Below this many whitespace-separated words the primary extraction is
# considered too thin and the newspaper fallback is attempted.
_MIN_MARKDOWN_WORDS = 100


@app.post("/extract")
def extract_article(request: URLRequest):
    """Fetch a web page and return its title and main content as Markdown.

    The page is fetched with ``requests`` and parsed with BeautifulSoup; the
    first of ``<article>``, ``<main>`` or ``<body>`` that exists is converted
    with ``html2text``. When that yields fewer than 100 words, the URL is
    processed again with ``newspaper.Article`` and that result replaces the
    primary one, provided it actually produced some text.

    Args:
        request: Request body carrying the ``url`` to extract.

    Returns:
        A dict with the keys ``"title"`` and ``"markdown"``.

    Raises:
        HTTPException: Status 500, with the underlying error message as the
            detail, when fetching or parsing fails.
    """
    # NOTE(review): request.url is caller-supplied and fetched server-side
    # as-is. If this service can reach internal hosts, consider restricting
    # the allowed schemes/hosts - confirm against the deployment.
    try:
        # 1. Primary attempt: requests + BeautifulSoup.
        response = requests.get(
            request.url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=_FETCH_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Best-effort pick of the main content container.
        article_tag = soup.find('article') or soup.find('main') or soup.body
        # Without any container, str(None) would feed the literal text
        # "None" to html2text; use an empty document instead.
        html_content = str(article_tag) if article_tag is not None else ""
        markdown = html2text.html2text(html_content).strip()

        # <title>.string is None when the tag is empty or has several
        # children, so guard it before falling back to the default.
        title = "Untitled"
        if soup.title and soup.title.string:
            title = soup.title.string.strip() or "Untitled"

        # 2. If the markdown is empty or too short, fall back to newspaper.
        if len(markdown.split()) < _MIN_MARKDOWN_WORDS:
            article = Article(request.url)
            article.download()
            article.parse()
            title = article.title or title
            fallback_text = (article.text or "").strip()
            # Keep the primary result when the fallback extracted nothing,
            # rather than replacing short-but-real content with emptiness.
            if fallback_text:
                markdown = html2text.html2text(fallback_text).strip()

        return {
            "title": title,
            "markdown": markdown
        }
    except Exception as e:
        # Endpoint boundary: surface any failure as a 500 carrying the
        # original message, keeping the cause chained for server logs.
        raise HTTPException(status_code=500, detail=str(e)) from e