holcombzv committed on
Commit
609d9fc
·
1 Parent(s): 8306a27

Updated scraping behavior.

Browse files
Files changed (2) hide show
  1. classes.py +1 -3
  2. functions.py +31 -11
classes.py CHANGED
@@ -16,9 +16,7 @@ class Article:
16
  def __init__(self, article_id: int, html: str):
17
  self.article_id = article_id
18
  self.html = html
19
- self.text = get_article_text(self.html) or ''
20
- self.text_length = len(self.text.split(' '))
21
- self.paragraphs = split_paragraphs(self.text) or []
22
  logger.info(f'\nParagraphs read: {len(self.paragraphs)}')
23
 
24
  for i, paragraph in enumerate(self.paragraphs):
 
16
  def __init__(self, article_id: int, html: str):
17
  self.article_id = article_id
18
  self.html = html
19
+ self.paragraphs = get_article_text(self.html) or ''
 
 
20
  logger.info(f'\nParagraphs read: {len(self.paragraphs)}')
21
 
22
  for i, paragraph in enumerate(self.paragraphs):
functions.py CHANGED
@@ -10,22 +10,42 @@ def get_article_text(html_text):
10
 
11
  article_text = []
12
 
13
- for tag in ["article", "story"]:
14
- article_body = soup.find_all(tag)
15
- if article_body:
16
- article_text.extend([clean_text(p.get_text(strip=True)) for p in article_body])
 
 
 
17
 
18
- # Plan B: Extract all <p> tags if no article/story tag is found
19
- if not article_text:
20
- logger.info(f'\nArticle text not found, using plan B')
21
- article_body = soup.find_all('p')
22
- if article_body:
23
- article_text.extend([clean_text(p.get_text(strip=True)) for p in article_body])
 
 
 
 
 
 
 
 
24
 
25
- return "\n".join(article_text) if article_text else "No article text found."
 
 
 
 
 
 
 
 
26
 
27
  except Exception as e:
28
  logger.exception(f'Error: Could not retrieve article text: {e}')
 
29
 
30
  def split_paragraphs(text: str):
31
  paragraphs = text.splitlines()
 
10
 
11
  article_text = []
12
 
13
+ # Step 1: Try <article> tag
14
+ article_tag = soup.find('article')
15
+ if article_tag:
16
+ paragraphs = article_tag.find_all('p')
17
+ article_text = [clean_text(p.get_text(strip=True)) for p in paragraphs if p.get_text(strip=True)]
18
+ if article_text:
19
+ return article_text
20
 
21
+ # Step 2: Try common container patterns (site-specific fallbacks)
22
+ candidates = [
23
+ {"name": "div", "attrs": {"class": "article-body"}},
24
+ {"name": "section", "attrs": {"name": "articleBody"}},
25
+ {"name": "div", "attrs": {"property": "articleBody"}},
26
+ {"name": "div", "attrs": {"class": "Article__content"}},
27
+ ]
28
+ for cand in candidates:
29
+ container = soup.find(cand["name"], cand["attrs"])
30
+ if container:
31
+ paragraphs = container.find_all('p')
32
+ article_text = [clean_text(p.get_text(strip=True)) for p in paragraphs if p.get_text(strip=True)]
33
+ if article_text:
34
+ return article_text
35
 
36
+ # Step 3: Fall back to all <p> tags, but filter out junk
37
+ bad_classes = ['caption', 'credit', 'advertisement', 'footer']
38
+ for p in soup.find_all('p'):
39
+ if not any(cls in (p.get('class') or []) for cls in bad_classes):
40
+ text = p.get_text(strip=True)
41
+ if text:
42
+ article_text.append(clean_text(text))
43
+
44
+ return article_text # Always return a list (may be empty)
45
 
46
  except Exception as e:
47
  logger.exception(f'Error: Could not retrieve article text: {e}')
48
+ return []
49
 
50
  def split_paragraphs(text: str):
51
  paragraphs = text.splitlines()