AbhishekParanjape committed on
Commit ·
e707907
1
Parent(s): ef981ba
undo semantic chunking
Browse files
- rag_system.py +7 -1
rag_system.py
CHANGED
|
@@ -199,6 +199,7 @@ class DocumentIngestion:
|
|
| 199 |
st.error("No embedding model available. Please install sentence-transformers or provide OpenAI API key.")
|
| 200 |
raise Exception("No embedding model available")
|
| 201 |
|
|
|
|
| 202 |
self.text_splitter = SemanticChunker(
|
| 203 |
embeddings_model=self.embeddings,
|
| 204 |
chunk_size=4, # 4 sentences per base chunk
|
|
@@ -207,9 +208,14 @@ class DocumentIngestion:
|
|
| 207 |
min_chunk_size=150, # Minimum 150 characters
|
| 208 |
max_chunk_size=1500, # Maximum 1500 characters
|
| 209 |
debug=True # Show statistics in Streamlit
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
)
|
| 211 |
|
| 212 |
-
st.info(f"🧠 Using semantic chunking with {self.embedding_type} embeddings")
|
| 213 |
self.persist_directory = os.getenv("CHROMA_PERSIST_DIRECTORY", "./chroma_db")
|
| 214 |
os.makedirs(self.persist_directory, exist_ok=True)
|
| 215 |
|
|
|
|
| 199 |
st.error("No embedding model available. Please install sentence-transformers or provide OpenAI API key.")
|
| 200 |
raise Exception("No embedding model available")
|
| 201 |
|
| 202 |
+
"""""
|
| 203 |
self.text_splitter = SemanticChunker(
|
| 204 |
embeddings_model=self.embeddings,
|
| 205 |
chunk_size=4, # 4 sentences per base chunk
|
|
|
|
| 208 |
min_chunk_size=150, # Minimum 150 characters
|
| 209 |
max_chunk_size=1500, # Maximum 1500 characters
|
| 210 |
debug=True # Show statistics in Streamlit
|
| 211 |
+
)"""
|
| 212 |
+
self.text_splitter = RecursiveCharacterTextSplitter(
|
| 213 |
+
chunk_size=1000,
|
| 214 |
+
chunk_overlap=200,
|
| 215 |
+
separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
|
| 216 |
)
|
| 217 |
|
| 218 |
+
"""st.info(f"🧠 Using semantic chunking with {self.embedding_type} embeddings")"""
|
| 219 |
self.persist_directory = os.getenv("CHROMA_PERSIST_DIRECTORY", "./chroma_db")
|
| 220 |
os.makedirs(self.persist_directory, exist_ok=True)
|
| 221 |
|