Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, HTTPException | |
| from pydantic import BaseModel | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import html2text | |
| from newspaper import Article | |
# FastAPI application instance for this module.
app = FastAPI()


class URLRequest(BaseModel):
    # URL of the page to extract. Declared as a plain string, so no
    # URL-format validation is expressed by this model itself.
    url: str
# Seconds to wait on the remote server before abandoning the primary fetch.
_FETCH_TIMEOUT_SECONDS = 10
# Below this many words the primary extraction is considered too thin and
# the newspaper-based fallback is attempted.
_MIN_WORD_COUNT = 100


def extract_article(request: URLRequest):
    """Fetch ``request.url`` and return its title and main content as Markdown.

    The page is first downloaded with ``requests`` and parsed with
    BeautifulSoup; the first of ``<article>``, ``<main>`` or ``<body>`` that
    exists is converted to Markdown with ``html2text``. If that yields fewer
    than ``_MIN_WORD_COUNT`` words, the URL is re-processed with
    ``newspaper.Article`` and its result is used when it is non-empty.

    Args:
        request: Request model carrying the target ``url``.

    Returns:
        A dict with the keys ``"title"`` and ``"markdown"``.

    Raises:
        HTTPException: With status 500 and the underlying error message as
            ``detail`` when any step (download, parsing, fallback) fails.
    """
    # NOTE(review): no route decorator is visible on this function in this
    # view; presumably it is registered on ``app`` elsewhere - confirm.
    # NOTE(review): the URL comes straight from the caller and is fetched
    # server-side; consider restricting allowed hosts if this is exposed to
    # untrusted clients.
    try:
        # 1. First attempt: requests + BeautifulSoup.
        response = requests.get(
            request.url,
            headers={"User-Agent": "Mozilla/5.0"},
            # Without a timeout a stalled server would block this call
            # indefinitely.
            timeout=_FETCH_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Best-effort guess at the main content container.
        article_tag = soup.find('article') or soup.find('main') or soup.body
        # A document without <body> gives None here; str(None) would leak
        # the literal text "None" into the output.
        html_content = str(article_tag) if article_tag is not None else ""
        markdown = html2text.html2text(html_content).strip()

        # ``soup.title.string`` is None when <title> is empty or contains
        # nested tags, so guard both levels before using it.
        title = "Untitled"
        if soup.title and soup.title.string:
            title = soup.title.string

        # 2. If the Markdown is empty or too short, fall back to newspaper.
        if len(markdown.split()) < _MIN_WORD_COUNT:
            article = Article(request.url)
            article.download()
            article.parse()
            title = article.title or title
            # NOTE(review): ``article.text`` is plain text rather than HTML;
            # the html2text pass is kept for backward compatibility, but it
            # may alter paragraph breaks - confirm the output is as desired.
            fallback_markdown = html2text.html2text(article.text).strip()
            # Keep the primary result when the fallback extracted nothing,
            # instead of replacing short content with an empty string.
            if fallback_markdown:
                markdown = fallback_markdown

        return {
            "title": title,
            "markdown": markdown,
        }
    except Exception as e:
        # Boundary handler: surface every failure as an HTTP 500 while
        # preserving the original exception as the cause for debugging.
        raise HTTPException(status_code=500, detail=str(e)) from e