Spaces:

KJ24
/

article-extractor

Sleeping

File size: 1,342 Bytes

1c0805d
 
bea80f3
 
 
979ab71
1c0805d
 
 
 
 
 
 
 
 
979ab71
 
bea80f3
 
 
979ab71
bea80f3
 
 
979ab71
bea80f3
1c0805d
979ab71
 
 
 
 
 
 
 
 
1c0805d
bea80f3
 
1c0805d
bea80f3
1c0805d

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import requests
from bs4 import BeautifulSoup
import html2text
from newspaper import Article

# FastAPI application instance; the `/extract` route in this module is
# registered on it through the `@app.post` decorator.
app = FastAPI()

# Request body accepted by the `/extract` endpoint.
class URLRequest(BaseModel):
    # Address of the page to fetch. Declared as a plain `str`, so it is not
    # validated as a URL before being handed to the HTTP client.
    url: str

# Seconds to wait on the remote server; `requests` applies no timeout unless
# one is passed, so a stalled host would otherwise block the worker forever.
_FETCH_TIMEOUT_SECONDS = 15

# Below this many words the first-pass extraction is considered too thin and
# the newspaper fallback is attempted.
_MIN_WORD_COUNT = 100


@app.post("/extract")
def extract_article(request: URLRequest):
    """Fetch a web page and return its title and main content as Markdown.

    The page is first downloaded with ``requests`` and the ``<article>``,
    ``<main>`` or ``<body>`` element (first one found, in that order) is
    converted to Markdown with ``html2text``. When that yields fewer than
    ``_MIN_WORD_COUNT`` words, the URL is re-processed with newspaper's
    ``Article`` and its result is used if it produced any text.

    Args:
        request: Request body carrying the ``url`` to extract.

    Returns:
        A dict with the keys ``"title"`` and ``"markdown"``.

    Raises:
        HTTPException: Status 500 with the underlying error message as
            ``detail`` when fetching or parsing fails.
    """
    try:
        # NOTE(security): the URL is caller-supplied and fetched server-side,
        # which exposes internal hosts to SSRF if this service is reachable by
        # untrusted clients. Restrict allowed hosts at the deployment level.

        # 1. First attempt: requests + BeautifulSoup.
        response = requests.get(
            request.url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=_FETCH_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Best-effort pick of the main content container. It can be None for
        # documents without a body; in that case leave the markdown empty
        # instead of converting the literal string "None".
        article_tag = soup.find('article') or soup.find('main') or soup.body
        markdown = ""
        if article_tag is not None:
            markdown = html2text.html2text(str(article_tag)).strip()

        # `.string` is None when <title> is empty or contains nested markup,
        # so fall back to the placeholder in those cases too.
        raw_title = soup.title.string if soup.title else None
        title = (raw_title or "").strip() or "Untitled"

        # 2. If the markdown is empty or too short, fall back to newspaper.
        if len(markdown.split()) < _MIN_WORD_COUNT:
            article = Article(request.url)
            article.download()
            article.parse()

            title = article.title or title
            fallback_markdown = html2text.html2text(article.text).strip()
            # Keep the first-pass result when the fallback found nothing, so
            # short but valid pages are not replaced by an empty string.
            if fallback_markdown:
                markdown = fallback_markdown

        return {
            "title": title,
            "markdown": markdown
        }

    except Exception as e:
        # Endpoint boundary: surface any failure as a 500 carrying the
        # original message, and keep the cause chained for server-side logs.
        raise HTTPException(status_code=500, detail=str(e)) from e