KJ24 committed on
Commit
979ab71
·
verified ·
1 Parent(s): c346316

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -6
app.py CHANGED
@@ -3,6 +3,7 @@ from pydantic import BaseModel
3
  import requests
4
  from bs4 import BeautifulSoup
5
  import html2text
 
6
 
7
  app = FastAPI()
8
 
@@ -12,20 +13,27 @@ class URLRequest(BaseModel):
12
  @app.post("/extract")
13
  def extract_article(request: URLRequest):
14
  try:
15
- response = requests.get(request.url)
 
16
  response.raise_for_status()
17
  soup = BeautifulSoup(response.text, 'html.parser')
18
 
19
- # Chercher le bloc principal d'article (à affiner si besoin)
20
  article_tag = soup.find('article') or soup.find('main') or soup.body
21
-
22
  html_content = str(article_tag)
23
 
24
- # Convertir HTML → Markdown
25
- markdown = html2text.html2text(html_content)
26
-
27
  title = soup.title.string if soup.title else "Untitled"
28
 
 
 
 
 
 
 
 
 
 
29
  return {
30
  "title": title,
31
  "markdown": markdown
 
3
  import requests
4
  from bs4 import BeautifulSoup
5
  import html2text
6
+ from newspaper import Article
7
 
8
  app = FastAPI()
9
 
 
13
  @app.post("/extract")
14
  def extract_article(request: URLRequest):
15
  try:
16
+ # 1. Essayer d'extraire avec requests + BeautifulSoup
17
+ response = requests.get(request.url, headers={"User-Agent": "Mozilla/5.0"})
18
  response.raise_for_status()
19
  soup = BeautifulSoup(response.text, 'html.parser')
20
 
21
+ # Extraire le contenu principal (meilleur effort)
22
  article_tag = soup.find('article') or soup.find('main') or soup.body
 
23
  html_content = str(article_tag)
24
 
25
+ markdown = html2text.html2text(html_content).strip()
 
 
26
  title = soup.title.string if soup.title else "Untitled"
27
 
28
+ # 2. Si le markdown est vide ou trop court, fallback sur newspaper3k
29
+ if len(markdown.split()) < 100:
30
+ article = Article(request.url)
31
+ article.download()
32
+ article.parse()
33
+
34
+ title = article.title or title
35
+ markdown = html2text.html2text(article.text)
36
+
37
  return {
38
  "title": title,
39
  "markdown": markdown