from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import requests
from bs4 import BeautifulSoup
import html2text
from newspaper import Article

# Upper bound, in seconds, for the upstream page fetch. Without a timeout
# `requests.get` can block forever on a stalled host.
REQUEST_TIMEOUT_SECONDS = 15

# Below this many words the first-pass extraction is considered too thin and
# the newspaper-based fallback is attempted.
MIN_WORD_COUNT = 100

# Title reported when the page exposes no usable <title>.
DEFAULT_TITLE = "Untitled"

app = FastAPI()


class URLRequest(BaseModel):
    """Request body for ``POST /extract``."""

    # Address of the page to fetch and convert.
    url: str


@app.post("/extract")
def extract_article(request: URLRequest):
    """Fetch a web page and return its title and main content as Markdown.

    The page is first fetched with ``requests`` and the ``<article>``,
    ``<main>`` or ``<body>`` element (first one found, in that order) is
    converted with ``html2text``. If that yields fewer than
    ``MIN_WORD_COUNT`` words, the URL is re-processed with newspaper's
    ``Article`` and its text is used when it is non-empty.

    Args:
        request: Body carrying the ``url`` to extract.

    Returns:
        A dict with the keys ``"title"`` and ``"markdown"``.

    Raises:
        HTTPException: With status 500 and the underlying error message as
            detail if fetching or parsing fails.
    """
    # NOTE(security): `request.url` is caller-supplied and fetched server-side
    # with no scheme/host restriction, so this endpoint can be aimed at
    # internal addresses (SSRF). Add an allow-list or address validation
    # before exposing it to untrusted clients.
    try:
        # 1. First attempt: requests + BeautifulSoup.
        response = requests.get(
            request.url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Best-effort pick of the main content container. `soup.body` may be
        # None (e.g. a fragment without <body>); avoid feeding the literal
        # string "None" to the converter in that case.
        article_tag = soup.find("article") or soup.find("main") or soup.body
        html_content = str(article_tag) if article_tag is not None else ""
        markdown = html2text.html2text(html_content).strip()

        # `.string` is None when <title> is empty or contains nested tags, so
        # guard it separately from the missing-<title> case.
        raw_title = soup.title.string if soup.title else None
        title = (raw_title or "").strip() or DEFAULT_TITLE

        # 2. Fallback to newspaper when the Markdown is empty or too short.
        if len(markdown.split()) < MIN_WORD_COUNT:
            article = Article(request.url)
            article.download()
            article.parse()
            title = article.title or title
            # `article.text` is already plain text; passing it through
            # html2text would collapse its paragraph breaks into single
            # spaces. Keep the first-pass result if the fallback is empty.
            fallback_text = article.text.strip()
            if fallback_text:
                markdown = fallback_text

        return {"title": title, "markdown": markdown}
    except Exception as e:
        # Top-level boundary: surface any failure as a 500 with its message,
        # keeping the original exception chained for server-side logs.
        raise HTTPException(status_code=500, detail=str(e)) from e