WeByT3 committed on
Commit
c3d7a61
·
verified ·
1 Parent(s): 0c6cece

Delete utilities.py

Browse files
Files changed (1) hide show
  1. utilities.py +0 -20
utilities.py DELETED
@@ -1,20 +0,0 @@
1
- import wikipedia
2
- import re
3
-
4
def clean_and_truncate(text: str, max_chars: int = 2000) -> str:
    """Collapse blank lines in *text* and truncate it to *max_chars* characters.

    Runs of two or more consecutive newlines are replaced by a single
    newline. If the cleaned text is longer than ``max_chars``, it is cut to
    that length and ``"..."`` is appended, so a truncated result is up to
    three characters longer than ``max_chars``.

    Args:
        text: The text to clean.
        max_chars: Maximum number of characters of cleaned text to keep.
            Negative values are treated as 0.

    Returns:
        The cleaned text, followed by ``"..."`` when it was truncated.
    """
    # A negative limit would slice from the *end* of the string
    # (text[:-1] keeps all but the last character), so clamp it to zero.
    limit = max(0, max_chars)
    text = re.sub(r'\n{2,}', '\n', text)  # remove extra newlines
    return text[:limit] + ("..." if len(text) > limit else "")
8
-
9
def extract_year_range(text: str, start_year: int, end_year: int) -> str:
    """Return the paragraphs of *text* that mention a year in the given range.

    A paragraph is a run of text delimited by single newline characters.
    Only four-digit years from 1900 through 2029 are recognised by the
    pattern, so bounds outside that window never match anything beyond it.

    Args:
        text: The text to scan.
        start_year: First year of the range (inclusive).
        end_year: Last year of the range (inclusive).

    Returns:
        The matching paragraphs, stripped of surrounding whitespace, in
        their original order and joined by blank lines. Each paragraph
        appears once even if it mentions several in-range years. Returns an
        empty string when nothing matches.
    """
    pattern = r"\b(19[0-9]{2}|20[0-2][0-9])\b"
    filtered_paragraphs = []
    last_start = -1  # start offset of the most recently captured paragraph

    for match in re.finditer(pattern, text):
        year = int(match.group())
        if not start_year <= year <= end_year:
            continue

        # rfind returns -1 when there is no preceding newline, so +1 yields
        # offset 0 for the first paragraph and skips the newline otherwise.
        para_start = text.rfind('\n', 0, match.start()) + 1
        if para_start == last_start:
            # Matches arrive left to right, so a repeated start offset means
            # this paragraph was already captured for an earlier year.
            continue

        # find returns -1 when the paragraph runs to the end of the text;
        # using -1 as a slice bound would drop the final character.
        para_end = text.find('\n', match.end())
        if para_end == -1:
            para_end = len(text)

        filtered_paragraphs.append(text[para_start:para_end].strip())
        last_start = para_start

    return "\n\n".join(filtered_paragraphs)