AbhishekParanjape committed on
Commit
e707907
·
1 Parent(s): ef981ba

undo semantic chunking

Browse files
Files changed (1) hide show
  1. rag_system.py +7 -1
rag_system.py CHANGED
@@ -199,6 +199,7 @@ class DocumentIngestion:
199
  st.error("No embedding model available. Please install sentence-transformers or provide OpenAI API key.")
200
  raise Exception("No embedding model available")
201
 
 
202
  self.text_splitter = SemanticChunker(
203
  embeddings_model=self.embeddings,
204
  chunk_size=4, # 4 sentences per base chunk
@@ -207,9 +208,14 @@ class DocumentIngestion:
207
  min_chunk_size=150, # Minimum 150 characters
208
  max_chunk_size=1500, # Maximum 1500 characters
209
  debug=True # Show statistics in Streamlit
 
 
 
 
 
210
  )
211
 
212
- st.info(f"🧠 Using semantic chunking with {self.embedding_type} embeddings")
213
  self.persist_directory = os.getenv("CHROMA_PERSIST_DIRECTORY", "./chroma_db")
214
  os.makedirs(self.persist_directory, exist_ok=True)
215
 
 
199
  st.error("No embedding model available. Please install sentence-transformers or provide OpenAI API key.")
200
  raise Exception("No embedding model available")
201
 
202
+ """""
203
  self.text_splitter = SemanticChunker(
204
  embeddings_model=self.embeddings,
205
  chunk_size=4, # 4 sentences per base chunk
 
208
  min_chunk_size=150, # Minimum 150 characters
209
  max_chunk_size=1500, # Maximum 1500 characters
210
  debug=True # Show statistics in Streamlit
211
+ )"""
212
+ self.text_splitter = RecursiveCharacterTextSplitter(
213
+ chunk_size=1000,
214
+ chunk_overlap=200,
215
+ separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
216
  )
217
 
218
+ """st.info(f"🧠 Using semantic chunking with {self.embedding_type} embeddings")"""
219
  self.persist_directory = os.getenv("CHROMA_PERSIST_DIRECTORY", "./chroma_db")
220
  os.makedirs(self.persist_directory, exist_ok=True)
221