Spaces:
Sleeping
Sleeping
Ali Abdullah
committed on
Update web_scraper.py
Browse files- web_scraper.py +2 -1
web_scraper.py
CHANGED
|
@@ -44,7 +44,7 @@ class WebScraper:
|
|
| 44 |
}
|
| 45 |
|
| 46 |
class TextChunker:
|
| 47 |
-
def __init__(self, chunk_size: int =
|
| 48 |
"""
|
| 49 |
Initialize text chunker
|
| 50 |
Args:
|
|
@@ -78,6 +78,7 @@ class TextChunker:
|
|
| 78 |
# If adding this sentence would exceed chunk size, create a new chunk
|
| 79 |
if current_length + sentence_length > self.chunk_size and current_chunk:
|
| 80 |
chunk_text = ' '.join(current_chunk)
|
|
|
|
| 81 |
chunks.append(self._create_chunk_dict(chunk_text, metadata, len(chunks)))
|
| 82 |
|
| 83 |
# Start new chunk with overlap
|
|
|
|
| 44 |
}
|
| 45 |
|
| 46 |
class TextChunker:
|
| 47 |
+
def __init__(self, chunk_size: int = 100, overlap: int = 20):
|
| 48 |
"""
|
| 49 |
Initialize text chunker
|
| 50 |
Args:
|
|
|
|
| 78 |
# If adding this sentence would exceed chunk size, create a new chunk
|
| 79 |
if current_length + sentence_length > self.chunk_size and current_chunk:
|
| 80 |
chunk_text = ' '.join(current_chunk)
|
| 81 |
+
print(f"📄 Chunk {len(chunks)}:\n{text[:150]}...\n")
|
| 82 |
chunks.append(self._create_chunk_dict(chunk_text, metadata, len(chunks)))
|
| 83 |
|
| 84 |
# Start new chunk with overlap
|