Updated scraping behavior.
- classes.py +1 -3
- functions.py +31 -11
classes.py
@@ -16,9 +16,7 @@ class Article:
     def __init__(self, article_id: int, html: str):
         self.article_id = article_id
         self.html = html
-        self.
-        self.text_length = len(self.text.split(' '))
-        self.paragraphs = split_paragraphs(self.text) or []
+        self.paragraphs = get_article_text(self.html) or ''
         logger.info(f'\nParagraphs read: {len(self.paragraphs)}')

         for i, paragraph in enumerate(self.paragraphs):
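The constructor now hands the raw HTML straight to get_article_text and stores the resulting paragraph list. A minimal sketch of the new flow (the module name and sample HTML are assumptions for illustration; Article, get_article_text, and logger come from this repo):

from classes import Article  # module name assumed from the file being edited

article = Article(article_id=1, html='<article><p>Hello world</p></article>')
print(article.paragraphs)    # e.g. ['Hello world'] when extraction succeeds

One note on the fallback: get_article_text always returns a list after this change, so self.paragraphs falls back to an empty string ('') when nothing is extracted. len() and iteration still behave, though an empty list ([]) would keep the attribute's type consistent.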
functions.py
@@ -10,22 +10,42 @@ def get_article_text(html_text):

         article_text = []

-
-
-
-
+        # Step 1: Try <article> tag
+        article_tag = soup.find('article')
+        if article_tag:
+            paragraphs = article_tag.find_all('p')
+            article_text = [clean_text(p.get_text(strip=True)) for p in paragraphs if p.get_text(strip=True)]
+            if article_text:
+                return article_text

-        #
-
-
-
-
-
+        # Step 2: Try common container patterns (site-specific fallbacks)
+        candidates = [
+            {"name": "div", "attrs": {"class": "article-body"}},
+            {"name": "section", "attrs": {"name": "articleBody"}},
+            {"name": "div", "attrs": {"property": "articleBody"}},
+            {"name": "div", "attrs": {"class": "Article__content"}},
+        ]
+        for cand in candidates:
+            container = soup.find(cand["name"], cand["attrs"])
+            if container:
+                paragraphs = container.find_all('p')
+                article_text = [clean_text(p.get_text(strip=True)) for p in paragraphs if p.get_text(strip=True)]
+                if article_text:
+                    return article_text

-
+        # Step 3: Fallback → all <p> tags, but filter out junk
+        bad_classes = ['caption', 'credit', 'advertisement', 'footer']
+        for p in soup.find_all('p'):
+            if not any(cls in (p.get('class') or []) for cls in bad_classes):
+                text = p.get_text(strip=True)
+                if text:
+                    article_text.append(clean_text(text))
+
+        return article_text  # Always return a list (may be empty)

     except Exception as e:
         logger.exception(f'Error: Could not retrieve article text: {e}')
+        return []

 def split_paragraphs(text: str):
     paragraphs = text.splitlines()
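For a quick local check of the new extraction order, the fragment below exercises Step 1 directly. BeautifulSoup is assumed from the diff's soup usage, and clean_text is stubbed as an identity function since the repo's helper is not shown here:

from bs4 import BeautifulSoup

def clean_text(text):  # stand-in for the repo's helper (assumption)
    return text

html = '<article><p>Lead paragraph.</p><p class="caption">Photo: X</p></article>'
soup = BeautifulSoup(html, 'html.parser')

# Step 1 behavior: every non-empty <p> inside <article> is kept
article_tag = soup.find('article')
paragraphs = [clean_text(p.get_text(strip=True))
              for p in article_tag.find_all('p') if p.get_text(strip=True)]
print(paragraphs)  # ['Lead paragraph.', 'Photo: X']

Because the bad_classes filter only runs in Step 3, captions and credits that sit inside an <article> tag (as above) still come through; only pages without an <article> wrapper or a matching container get the stricter filtering.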