Spaces:

KJ24
/

article-extractor

Runtime error

KJ24 committed on May 1, 2025

Commit

bea80f3

verified ·

1 Parent(s): 2ce25e8

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,6 +1,8 @@
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
-from newspaper import Article
 app = FastAPI()
@@ -10,15 +12,24 @@ class URLRequest(BaseModel):
 @app.post("/extract")
 def extract_article(request: URLRequest):
     try:
-        article = Article(request.url)
-        article.download()
-        article.parse()
         return {
-            "title": article.title,
-            "content": article.text,
-            "author": article.authors,
-            "date": str(article.publish_date) if article.publish_date else None
         }
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))

 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
+import requests
+from bs4 import BeautifulSoup
+import html2text
 app = FastAPI()
 @app.post("/extract")
 def extract_article(request: URLRequest):
     try:
+        response = requests.get(request.url)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, 'html.parser')
+        # Chercher le bloc principal d'article (à affiner si besoin)
+        article_tag = soup.find('article') or soup.find('main') or soup.body
+        html_content = str(article_tag)
+        # Convertir HTML → Markdown
+        markdown = html2text.html2text(html_content)
+        title = soup.title.string if soup.title else "Untitled"
         return {
+            "title": title,
+            "markdown": markdown
         }
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))