Spaces:

KJ24
/

article-extractor

Runtime error

KJ24 committed on May 1, 2025

Commit

979ab71

verified ·

1 Parent(s): c346316

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ from pydantic import BaseModel
 import requests
 from bs4 import BeautifulSoup
 import html2text
 app = FastAPI()
@@ -12,20 +13,27 @@ class URLRequest(BaseModel):
 @app.post("/extract")
 def extract_article(request: URLRequest):
     try:
-        response = requests.get(request.url)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
-        # Chercher le bloc principal d'article (à affiner si besoin)
         article_tag = soup.find('article') or soup.find('main') or soup.body
         html_content = str(article_tag)
-        # Convertir HTML → Markdown
-        markdown = html2text.html2text(html_content)
         title = soup.title.string if soup.title else "Untitled"
         return {
             "title": title,
             "markdown": markdown

 import requests
 from bs4 import BeautifulSoup
 import html2text
+from newspaper import Article
 app = FastAPI()
 @app.post("/extract")
 def extract_article(request: URLRequest):
     try:
+        # 1. Essayer d'extraire avec requests + BeautifulSoup
+        response = requests.get(request.url, headers={"User-Agent": "Mozilla/5.0"})
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
+        # Extraire le contenu principal (meilleur effort)
         article_tag = soup.find('article') or soup.find('main') or soup.body
         html_content = str(article_tag)
+        markdown = html2text.html2text(html_content).strip()
         title = soup.title.string if soup.title else "Untitled"
+        # 2. Si le markdown est vide ou trop court, fallback sur newspaper3k
+        if len(markdown.split()) < 100:
+            article = Article(request.url)
+            article.download()
+            article.parse()
+            title = article.title or title
+            markdown = html2text.html2text(article.text)
         return {
             "title": title,
             "markdown": markdown