File size: 1,342 Bytes
1c0805d
 
bea80f3
 
 
979ab71
1c0805d
 
 
 
 
 
 
 
 
979ab71
 
bea80f3
 
 
979ab71
bea80f3
 
 
979ab71
bea80f3
1c0805d
979ab71
 
 
 
 
 
 
 
 
1c0805d
bea80f3
 
1c0805d
bea80f3
1c0805d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import requests
from bs4 import BeautifulSoup
import html2text
from newspaper import Article

# FastAPI application instance; the route handler in this module is
# registered on it through the @app.post decorator.
app = FastAPI()

# Request body schema accepted by the /extract endpoint.
class URLRequest(BaseModel):
    # Address of the page to fetch. Declared as a plain string, so no
    # URL-specific validation is applied by this model.
    url: str

# Upper bound, in seconds, on how long the initial page fetch may block.
_FETCH_TIMEOUT_SECONDS = 10

# Below this many words the first-pass extraction is considered too thin
# and the newspaper-based fallback is attempted.
_MIN_WORD_COUNT = 100


@app.post("/extract")
def extract_article(request: URLRequest):
    """Fetch a web page and return its title and main content as Markdown.

    The page is first downloaded with ``requests`` and parsed with
    BeautifulSoup; the ``<article>`` element (or ``<main>``, or ``<body>``)
    is converted to Markdown with ``html2text``. If that yields fewer than
    100 words, newspaper's ``Article`` extractor is tried as a fallback and
    its result is used when it produces any text.

    Args:
        request: Request body carrying the ``url`` of the page to extract.

    Returns:
        A dict with two string entries: ``"title"`` and ``"markdown"``.

    Raises:
        HTTPException: With status 500 and the underlying error message as
            detail when downloading or parsing fails.
    """
    # NOTE(review): request.url is client-supplied and fetched server-side
    # without any restriction (possible SSRF) — consider validating the
    # scheme/host before exposing this on a network with internal services.
    try:
        # 1. First attempt: plain HTTP fetch + BeautifulSoup.
        # Without a timeout a stalled remote server would block this worker
        # indefinitely.
        response = requests.get(
            request.url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=_FETCH_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Best-effort pick of the main content container.
        article_tag = soup.find('article') or soup.find('main') or soup.body
        # If none of the containers exist, str(None) would turn the literal
        # text "None" into the page content; use an empty document instead.
        html_content = str(article_tag) if article_tag is not None else ""

        markdown = html2text.html2text(html_content).strip()

        # <title>.string is None when the tag is empty or contains nested
        # markup, so guard against it as well as against a missing <title>.
        title = "Untitled"
        if soup.title is not None and soup.title.string:
            title = str(soup.title.string).strip() or "Untitled"

        # 2. If the Markdown is empty or too short, fall back to newspaper.
        if len(markdown.split()) < _MIN_WORD_COUNT:
            article = Article(request.url)
            article.download()
            article.parse()

            title = article.title or title
            # NOTE(review): article.text is plain text, yet it is still run
            # through html2text as in the original code — confirm whether
            # paragraph breaks survive this conversion.
            fallback_markdown = html2text.html2text(article.text).strip()
            # Keep the first-pass result when the fallback extracted nothing,
            # rather than replacing short-but-real content with an empty one.
            if fallback_markdown:
                markdown = fallback_markdown

        return {
            "title": title,
            "markdown": markdown
        }

    except Exception as e:
        # Top-level boundary of the endpoint: report any failure as a 500
        # while keeping the original exception attached as the cause.
        raise HTTPException(status_code=500, detail=str(e)) from e