Ali Abdullah committed on
Commit
aec7049
·
verified ·
1 Parent(s): 24d4205

Update web_scraper.py

Browse files
Files changed (1) hide show
  1. web_scraper.py +2 -1
web_scraper.py CHANGED
@@ -44,7 +44,7 @@ class WebScraper:
44
  }
45
 
46
  class TextChunker:
47
- def __init__(self, chunk_size: int = 500, overlap: int = 50):
48
  """
49
  Initialize text chunker
50
  Args:
@@ -78,6 +78,7 @@ class TextChunker:
78
  # If adding this sentence would exceed chunk size, create a new chunk
79
  if current_length + sentence_length > self.chunk_size and current_chunk:
80
  chunk_text = ' '.join(current_chunk)
 
81
  chunks.append(self._create_chunk_dict(chunk_text, metadata, len(chunks)))
82
 
83
  # Start new chunk with overlap
 
44
  }
45
 
46
  class TextChunker:
47
+ def __init__(self, chunk_size: int = 100, overlap: int = 20):
48
  """
49
  Initialize text chunker
50
  Args:
 
78
  # If adding this sentence would exceed chunk size, create a new chunk
79
  if current_length + sentence_length > self.chunk_size and current_chunk:
80
  chunk_text = ' '.join(current_chunk)
81
+ print(f"📄 Chunk {len(chunks)}:\n{text[:150]}...\n")
82
  chunks.append(self._create_chunk_dict(chunk_text, metadata, len(chunks)))
83
 
84
  # Start new chunk with overlap