yalrashed committed on
Commit
9879995
·
verified ·
1 Parent(s): 1508d36

Upload analysis_cleaner.py

Browse files
Files changed (1) hide show
  1. src/analysis/analysis_cleaner.py +60 -0
src/analysis/analysis_cleaner.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
class AnalysisCleaner:
    """Clean up analysis text: drop repeated paragraphs and flatten lists.

    The set of paragraphs already seen lives on the instance and is never
    cleared, so a paragraph emitted by one call is treated as a duplicate
    in every later call on the same instance. Create a new instance to
    start with an empty history.
    """

    # A list marker only counts when it is followed by whitespace or the end
    # of the line. Without that requirement "**bold**" would lose its first
    # asterisk and "3.5 percent" would lose its "3.".
    _BULLET_RE = re.compile(r'^\s*(?:[*\-](?:\s+|$)|•\s*)')
    _NUMBERING_RE = re.compile(r'^\s*\d+\.(?:\s+|$)')

    def __init__(self):
        # Normalized (lower-cased, whitespace-collapsed) paragraphs that have
        # already been emitted by remove_duplicates().
        self.seen_paragraphs = set()

    def remove_duplicates(self, text: str) -> str:
        """Remove duplicate paragraphs while preserving order.

        Paragraphs are the pieces of *text* separated by a blank line
        (``'\\n\\n'``). Two paragraphs are duplicates when they are equal
        after lower-casing and collapsing runs of whitespace. The first
        occurrence is kept verbatim; empty paragraphs are dropped.

        Args:
            text: The text to deduplicate.

        Returns:
            The surviving paragraphs joined by a blank line.
        """
        unique_paragraphs = []

        for paragraph in text.split('\n\n'):
            # Normalized form used only for comparison; the original
            # paragraph text is what gets returned.
            normalized = ' '.join(paragraph.lower().split())
            if normalized and normalized not in self.seen_paragraphs:
                self.seen_paragraphs.add(normalized)
                unique_paragraphs.append(paragraph)

        return '\n\n'.join(unique_paragraphs)

    def reorganize_content(self, text: str) -> str:
        """Convert bullet points into flowing paragraphs.

        Leading bullet markers (``*``, ``-``, ``•``) and ``N.`` numbering are
        stripped from each line. Consecutive non-blank lines are joined with
        a single space into one paragraph; a blank line ends the current
        paragraph. Lines starting with ``###`` are kept as standalone
        section headers.

        Args:
            text: The text to reflow.

        Returns:
            Paragraphs and headers joined by a blank line.
        """
        current_paragraph = []
        flowing_text = []

        for line in text.split('\n'):
            # Strip the bullet first, then numbering, so "- 1. item" is
            # reduced to "item".
            cleaned_line = self._BULLET_RE.sub('', line)
            cleaned_line = self._NUMBERING_RE.sub('', cleaned_line)

            if not cleaned_line.strip():
                # Blank line (or a bare marker): close the open paragraph.
                if current_paragraph:
                    flowing_text.append(' '.join(current_paragraph))
                    current_paragraph = []
                continue

            if cleaned_line.startswith('###'):
                # A header terminates the paragraph in progress and is
                # emitted as its own entry.
                if current_paragraph:
                    flowing_text.append(' '.join(current_paragraph))
                    current_paragraph = []
                flowing_text.append(cleaned_line)
            else:
                current_paragraph.append(cleaned_line)

        if current_paragraph:
            flowing_text.append(' '.join(current_paragraph))

        return '\n\n'.join(flowing_text)

    def clean_analysis(self, text: str) -> str:
        """Apply all cleanup steps.

        Runs remove_duplicates(), then reorganize_content(), then collapses
        any run of three or more newlines down to a single blank line.

        Args:
            text: The raw analysis text.

        Returns:
            The cleaned text.
        """
        # Deduplicate before reflowing so that paragraph boundaries are
        # still the ones present in the input.
        cleaned = self.remove_duplicates(text)
        cleaned = self.reorganize_content(cleaned)

        # Clean up extra whitespace.
        cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)

        return cleaned