Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -645,7 +645,7 @@ class EnhancedPDFProcessor:
|
|
| 645 |
not clean_text.isdigit() and # Not just numbers
|
| 646 |
(word_count > 0 or len(clean_text) > 30)) # Has common words or substantial length
|
| 647 |
|
| 648 |
-
def chunk_text(self, text: str, chunk_size: int =
|
| 649 |
"""Split text into overlapping chunks"""
|
| 650 |
if not text or len(text.strip()) < 50:
|
| 651 |
return []
|
|
@@ -701,7 +701,7 @@ class GoogleT5Model:
|
|
| 701 |
self.tokenizer = None
|
| 702 |
self.available = False
|
| 703 |
# Use even smaller model for HF Spaces compatibility
|
| 704 |
-
self.model_name = "google/flan-t5-
|
| 705 |
|
| 706 |
if libs.get('torch') and libs.get('transformers'):
|
| 707 |
self._initialize_model()
|
|
|
|
| 645 |
not clean_text.isdigit() and # Not just numbers
|
| 646 |
(word_count > 0 or len(clean_text) > 30)) # Has common words or substantial length
|
| 647 |
|
| 648 |
+
def chunk_text(self, text: str, chunk_size: int = 300, overlap: int = 50) -> List[str]:
|
| 649 |
"""Split text into overlapping chunks"""
|
| 650 |
if not text or len(text.strip()) < 50:
|
| 651 |
return []
|
|
|
|
| 701 |
self.tokenizer = None
|
| 702 |
self.available = False
|
| 703 |
# Use even smaller model for HF Spaces compatibility
|
| 704 |
+
self.model_name = "google/flan-t5-base"
|
| 705 |
|
| 706 |
if libs.get('torch') and libs.get('transformers'):
|
| 707 |
self._initialize_model()
|