Spaces:

yalrashed
/

ScriptLLM

Sleeping

App Files Files Community

yalrashed committed on Dec 5, 2024

Commit

9879995

verified ·

1 Parent(s): 1508d36

Upload analysis_cleaner.py

Browse files

Files changed (1) hide show

src/analysis/analysis_cleaner.py +60 -0

src/analysis/analysis_cleaner.py ADDED Viewed

	@@ -0,0 +1,60 @@

+import re
class AnalysisCleaner:
    """Tidy up generated analysis text.

    The cleaner drops repeated paragraphs and turns bullet/numbered lists
    into flowing paragraphs while keeping ``###`` section headers on their
    own.

    Note:
        Paragraphs already seen are remembered on the instance
        (``seen_paragraphs``), so a paragraph emitted by one call is also
        treated as a duplicate in later calls on the same instance. Call
        :meth:`reset` (or create a new instance) to start fresh.
    """

    # List marker at the start of a line. "*" and "-" only count as bullets
    # when followed by whitespace (or when they are the whole line), so that
    # markdown emphasis ("**Bold**", "*note*") and negative numbers ("-5")
    # are left intact. "•" is unambiguous and may be followed by anything.
    _BULLET_RE = re.compile(r'^\s*(?:•\s*|[*\-](?:\s+|$))')

    # "1." style numbering at the start of a line. The negative lookahead
    # keeps decimals such as "3.14" from losing their integer part.
    _NUMBERING_RE = re.compile(r'^\s*\d+\.(?!\d)\s*')

    # Three or more consecutive newlines (i.e. more than one blank line).
    _EXTRA_NEWLINES_RE = re.compile(r'\n{3,}')

    def __init__(self):
        # Normalized (lower-cased, whitespace-collapsed) paragraphs that have
        # already been emitted by remove_duplicates().
        self.seen_paragraphs = set()

    def reset(self) -> None:
        """Forget every paragraph seen so far."""
        self.seen_paragraphs.clear()

    def remove_duplicates(self, text: str) -> str:
        """Remove duplicate paragraphs while preserving order.

        Paragraphs are delimited by a blank line (``"\\n\\n"``). Two
        paragraphs are considered equal when they match after lower-casing
        and collapsing runs of whitespace. Empty paragraphs are dropped.

        Args:
            text: The text to de-duplicate.

        Returns:
            The first occurrence of each paragraph, in original order and
            with original formatting, joined by blank lines. Paragraphs seen
            in earlier calls on this instance are dropped as well.
        """
        unique_paragraphs = []
        for paragraph in text.split('\n\n'):
            # Compare on a normalized key, but keep the original text.
            normalized = ' '.join(paragraph.lower().split())
            if not normalized or normalized in self.seen_paragraphs:
                continue
            self.seen_paragraphs.add(normalized)
            unique_paragraphs.append(paragraph)
        return '\n\n'.join(unique_paragraphs)

    def reorganize_content(self, text: str) -> str:
        """Convert bullet points into flowing paragraphs.

        Consecutive non-blank lines are merged into one paragraph, with
        their bullet markers (``*``, ``-``, ``•``) and ``1.``-style
        numbering removed. Blank lines end the current paragraph. Lines
        starting with ``###`` are kept as stand-alone section headers.

        Args:
            text: The text to reflow.

        Returns:
            Paragraphs and headers joined by blank lines.
        """
        flowing_text = []
        current_paragraph = []

        def flush():
            # Emit the paragraph collected so far, if any.
            if current_paragraph:
                flowing_text.append(' '.join(current_paragraph))
                current_paragraph.clear()

        for line in text.split('\n'):
            cleaned_line = self._BULLET_RE.sub('', line)
            cleaned_line = self._NUMBERING_RE.sub('', cleaned_line)
            # Strip so indentation and trailing "\r"/spaces do not leak into
            # the joined paragraph or hide an indented header.
            cleaned_line = cleaned_line.strip()

            if not cleaned_line:
                flush()
            elif cleaned_line.startswith('###'):  # Keep section headers
                flush()
                flowing_text.append(cleaned_line)
            else:
                current_paragraph.append(cleaned_line)

        flush()
        return '\n\n'.join(flowing_text)

    def clean_analysis(self, text: str) -> str:
        """Apply all cleanup steps.

        Runs :meth:`remove_duplicates`, then :meth:`reorganize_content`,
        then collapses any run of three or more newlines into one blank
        line.

        Args:
            text: The raw analysis text.

        Returns:
            The cleaned text.
        """
        cleaned = self.remove_duplicates(text)
        cleaned = self.reorganize_content(cleaned)
        return self._EXTRA_NEWLINES_RE.sub('\n\n', cleaned)