WeByT3 committed on
Commit
4c0107d
·
verified ·
1 Parent(s): 4d3f295

Create utilities.py

Browse files
Files changed (1) hide show
  1. utilities.py +20 -0
utilities.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import wikipedia
2
+ import re
3
+
4
def clean_and_truncate(text: str, max_chars: int = 2000) -> str:
    """Collapse blank lines in *text* and cap it at *max_chars* characters.

    Runs of two or more consecutive newlines are replaced by a single
    newline. If the cleaned text is longer than ``max_chars``, only its
    first ``max_chars`` characters are kept and ``"..."`` is appended to
    mark the cut, so the result can be up to three characters longer than
    ``max_chars``.

    Args:
        text: The string to clean.
        max_chars: Number of characters of cleaned text to keep. Negative
            values are treated as 0.

    Returns:
        The cleaned, possibly truncated, string.
    """
    # A negative limit would otherwise slice from the end (text[:-n]) and
    # keep nearly the whole text instead of truncating it.
    limit = max(max_chars, 0)
    cleaned = re.sub(r"\n{2,}", "\n", text)
    if len(cleaned) <= limit:
        return cleaned
    return cleaned[:limit] + "..."
8
+
9
def extract_year_range(text: str, start_year: int, end_year: int) -> str:
    """Return the paragraphs of *text* that mention a year in a given range.

    A paragraph is a run of text delimited by single newline characters.
    Only four-digit years from 1900 to 2029 are recognised (that is all the
    pattern matches), so a range outside that window selects nothing.

    Args:
        text: The text to filter.
        start_year: First year of the range, inclusive.
        end_year: Last year of the range, inclusive.

    Returns:
        The matching paragraphs, stripped of surrounding whitespace, in
        their original order and joined by blank lines. Each paragraph
        appears at most once even when it mentions several matching years.
        An empty string is returned when nothing matches.
    """
    year_pattern = re.compile(r"\b(?:19[0-9]{2}|20[0-2][0-9])\b")
    selected = []
    # Working line by line avoids hand-computed slice bounds: str.find
    # returns -1 when there is no trailing newline, which used to chop the
    # last character off the final paragraph.
    for paragraph in text.split("\n"):
        mentions_year_in_range = any(
            start_year <= int(found.group()) <= end_year
            for found in year_pattern.finditer(paragraph)
        )
        if mentions_year_in_range:
            selected.append(paragraph.strip())
    return "\n\n".join(selected)