Spaces:

Tesneem
/

document_chunker

Sleeping

App Files Files Community

Tesneem committed on Aug 8, 2025

Commit

25d9750

verified ·

1 Parent(s): 08637b4

Update document_chunker.py

Browse files

Files changed (1) hide show

document_chunker.py +18 -22

document_chunker.py CHANGED Viewed

@@ -7,6 +7,7 @@ from dataclasses import dataclass
 from docx import Document
 from sentence_transformers import SentenceTransformer
 from sklearn.feature_extraction.text import TfidfVectorizer
 @dataclass
@@ -56,37 +57,20 @@ class DocumentChunker:
             }
         }
-    def match_category(self, text: str, return_first: bool = True) -> Optional[str] or List[str]:
-        lower_text = text.lower()
-        match_scores = defaultdict(int)
-        for category, patterns in self.category_patterns.items():
-            for pattern in patterns:
-                matches = re.findall(pattern, lower_text)
-                match_scores[category] += len(matches)
-        if not match_scores:
-            return None if return_first else []
-        sorted_categories = sorted(match_scores.items(), key=lambda x: -x[1])
-        return sorted_categories[0][0] if return_first else [cat for cat, _ in sorted_categories if match_scores[cat] > 0]
-    # def extract_text_from_docx(self, file_path: str) -> str:
-    #     doc = Document(file_path)
-    #     return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
     def extract_text(self, file_path: str) -> str:
         if file_path.endswith(".docx"):
             doc = Document(file_path)
             return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
         elif file_path.endswith(".pdf"):
-            import fitz  # PyMuPDF
             text = ""
             with fitz.open(file_path) as doc:
                 for page in doc:
                     text += page.get_text()
             return text
-        else:
             return Path(file_path).read_text()
     def detect_document_type(self, text: str) -> str:
         keywords = ['grant', 'funding', 'mission']
@@ -109,7 +93,6 @@ class DocumentChunker:
         chunks = []
         if not headers:
-            # fallback chunking
             words = text.split()
             for i in range(0, len(words), max_words):
                 piece = ' '.join(words[i:i + max_words])
@@ -140,6 +123,20 @@ class DocumentChunker:
                 })
         return chunks
     def extract_topics_tfidf(self, text: str, max_features: int = 3) -> List[str]:
         clean = re.sub(r'[^\w\s]', ' ', text.lower())
         vectorizer = TfidfVectorizer(max_features=max_features * 2, stop_words='english')
@@ -158,7 +155,6 @@ class DocumentChunker:
     def process_document(self, file_path: str, title: Optional[str] = None) -> List[Dict]:
         file_path = Path(file_path)
-        # text = self.extract_text_from_docx(str(file_path)) if file_path.suffix == ".docx" else file_path.read_text()
         text = self.extract_text(str(file_path))
         doc_type = self.detect_document_type(text)
         headers = self.extract_headers(text, doc_type)

 from docx import Document
 from sentence_transformers import SentenceTransformer
 from sklearn.feature_extraction.text import TfidfVectorizer
+import fitz  # PyMuPDF
 @dataclass
             }
         }
     def extract_text(self, file_path: str) -> str:
         if file_path.endswith(".docx"):
             doc = Document(file_path)
             return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
         elif file_path.endswith(".pdf"):
             text = ""
             with fitz.open(file_path) as doc:
                 for page in doc:
                     text += page.get_text()
             return text
+        elif file_path.endswith(".txt"):
             return Path(file_path).read_text()
+        else:
+            raise ValueError("Unsupported file format")
     def detect_document_type(self, text: str) -> str:
         keywords = ['grant', 'funding', 'mission']
         chunks = []
         if not headers:
             words = text.split()
             for i in range(0, len(words), max_words):
                 piece = ' '.join(words[i:i + max_words])
                 })
         return chunks
+    def match_category(self, text: str, return_first: bool = True) -> Optional[str] or List[str]:
+        lower_text = text.lower()
+        match_scores = defaultdict(int)
+        for category, patterns in self.category_patterns.items():
+            for pattern in patterns:
+                matches = re.findall(pattern, lower_text)
+                match_scores[category] += len(matches)
+        if not match_scores:
+            return None if return_first else []
+        sorted_categories = sorted(match_scores.items(), key=lambda x: -x[1])
+        return sorted_categories[0][0] if return_first else [cat for cat, _ in sorted_categories if match_scores[cat] > 0]
     def extract_topics_tfidf(self, text: str, max_features: int = 3) -> List[str]:
         clean = re.sub(r'[^\w\s]', ' ', text.lower())
         vectorizer = TfidfVectorizer(max_features=max_features * 2, stop_words='english')
     def process_document(self, file_path: str, title: Optional[str] = None) -> List[Dict]:
         file_path = Path(file_path)
         text = self.extract_text(str(file_path))
         doc_type = self.detect_document_type(text)
         headers = self.extract_headers(text, doc_type)