dure-waseem committed on
Commit
bf08844
·
1 Parent(s): a7017a6

initial code

Browse files
Files changed (2) hide show
  1. app.py +0 -353
  2. chromadb_upload.py +0 -283
app.py CHANGED
@@ -1,356 +1,3 @@
1
- # import gradio as gr
2
- # import os
3
- # import tempfile
4
- # import shutil
5
- # from chromadb_query import ChromaCollection
6
- # from chromadb_upload import ChromaUploader
7
-
8
- # # Global variables to store instances
9
- # chroma_collection = None
10
- # chroma_uploader = None
11
- # current_api_key = None
12
-
13
- # def initialize_chroma_components(api_key):
14
- # """Initialize ChromaDB components with the provided API key"""
15
- # global chroma_collection, chroma_uploader, current_api_key
16
-
17
- # if not api_key:
18
- # return "❌ Please provide an OpenAI API key"
19
-
20
- # try:
21
- # # Set the API key in environment
22
- # os.environ["OPENAI_API_KEY"] = api_key
23
- # current_api_key = api_key
24
-
25
- # # Initialize components
26
- # db_path = "./db"
27
- # os.makedirs(db_path, exist_ok=True)
28
- # collection_name = "my_collection"
29
-
30
- # chroma_collection = ChromaCollection(collection_name, db_path, api_key)
31
- # chroma_uploader = ChromaUploader(collection_name, db_path, api_key)
32
-
33
- # return "✅ ChromaDB components initialized successfully!"
34
-
35
- # except Exception as e:
36
- # return f"❌ Error initializing components: {str(e)}"
37
-
38
- # def query_documents(api_key, query):
39
- # """Query the document collection"""
40
- # global chroma_collection
41
-
42
- # if not api_key:
43
- # return "❌ Please provide an OpenAI API key"
44
-
45
- # if not query.strip():
46
- # return "❌ Please enter a query"
47
-
48
- # # Validate API key format
49
- # if not api_key.startswith("sk-") or len(api_key) < 20:
50
- # return "❌ Invalid OpenAI API key format. It should start with 'sk-' and be longer than 20 characters."
51
-
52
- # # Initialize or check if we need to reinitialize
53
- # if chroma_collection is None or current_api_key != api_key:
54
- # init_msg = initialize_chroma_components(api_key)
55
- # if "Error" in init_msg:
56
- # return init_msg
57
-
58
- # try:
59
- # # Query the collection with fixed n_results=5
60
- # results = chroma_collection.query_collection([query], n_results=5)
61
-
62
- # if not results['documents'][0]:
63
- # return """❌ No documents found in the collection.
64
-
65
- # 📚 **Next steps:**
66
- # 1. Go to the "πŸ“„ Upload Documents" tab
67
- # 2. Upload some PDF files first
68
- # 3. Come back and ask your question"""
69
-
70
- # # Generate answer
71
- # answer = chroma_collection.generate_answer(query, results)
72
-
73
- # # Check if answer indicates an error
74
- # if answer.startswith("Error generating answer"):
75
- # return f"""❌ Error generating answer: {answer}
76
-
77
- # πŸ” **Troubleshooting:**
78
- # - Check your internet connection
79
- # - Verify your OpenAI API key has credits
80
- # - Try a simpler question
81
- # - Wait a moment and try again"""
82
-
83
- # # Count documents for context
84
- # try:
85
- # doc_count = chroma_collection.get_collection_count()
86
- # context_info = f"\n\n---\n*Answer based on {len(results['documents'][0])} relevant chunks from {doc_count} total documents*"
87
- # except:
88
- # context_info = f"\n\n---\n*Answer based on {len(results['documents'][0])} relevant document chunks*"
89
-
90
- # return f"🤖 **Answer:**\n\n{answer}{context_info}"
91
-
92
- # except Exception as e:
93
- # error_msg = str(e).lower()
94
- # if "connection" in error_msg or "timeout" in error_msg:
95
- # return f"""❌ Connection error: {str(e)}
96
-
97
- # πŸ” **Troubleshooting:**
98
- # - Check your internet connection
99
- # - Verify OpenAI API is accessible
100
- # - Try again in a few moments"""
101
- # elif "api" in error_msg and "key" in error_msg:
102
- # return f"""❌ API key error: {str(e)}
103
-
104
- # πŸ”‘ **Please check:**
105
- # - Your API key is correct
106
- # - Your OpenAI account has sufficient credits
107
- # - The API key has the necessary permissions"""
108
- # else:
109
- # return f"❌ Error querying documents: {str(e)}"
110
-
111
- # def upload_pdf(api_key, pdf_file):
112
- # """Upload and process PDF file"""
113
- # global chroma_uploader
114
-
115
- # if not api_key:
116
- # return "❌ Please provide an OpenAI API key"
117
-
118
- # if pdf_file is None:
119
- # return "❌ Please upload a PDF file"
120
-
121
- # # Validate API key format
122
- # if not api_key.startswith("sk-") or len(api_key) < 20:
123
- # return "❌ Invalid OpenAI API key format. It should start with 'sk-' and be longer than 20 characters."
124
-
125
- # # Initialize or check if we need to reinitialize
126
- # if chroma_uploader is None or current_api_key != api_key:
127
- # init_msg = initialize_chroma_components(api_key)
128
- # if "Error" in init_msg:
129
- # return init_msg
130
-
131
- # try:
132
- # # Read the PDF file
133
- # with open(pdf_file.name, 'rb') as file:
134
- # pdf_bytes = file.read()
135
-
136
- # # Extract text from PDF
137
- # pdf_text, pdf_lines = chroma_uploader.extract_text_from_pdf_bytes(pdf_bytes)
138
-
139
- # if not pdf_text or not pdf_lines:
140
- # return "❌ Could not extract text from the PDF file. Make sure it's a text-based PDF (not scanned images)."
141
-
142
- # # Add documents to ChromaDB with better feedback
143
- # print(f"Processing {len(pdf_lines)} document chunks...")
144
- # success = chroma_uploader.add_documents(pdf_lines)
145
-
146
- # if success:
147
- # # Get updated count
148
- # try:
149
- # count = chroma_uploader.get_collection_count()
150
- # return f"βœ… Successfully processed PDF!\n\nπŸ“Š Added document chunks from '{os.path.basename(pdf_file.name)}'\nπŸ—ƒοΈ Total documents in collection: {count}"
151
- # except:
152
- # return f"βœ… Successfully processed and added document chunks from '{os.path.basename(pdf_file.name)}'!"
153
- # else:
154
- # return """❌ Failed to add documents to ChromaDB.
155
-
156
- # πŸ” **Troubleshooting tips:**
157
- # - Check your internet connection
158
- # - Verify your OpenAI API key has credits
159
- # - Try uploading a smaller PDF file
160
- # - Wait a moment and try again (rate limits)"""
161
-
162
- # except Exception as e:
163
- # error_msg = str(e).lower()
164
- # if "connection" in error_msg or "timeout" in error_msg:
165
- # return f"""❌ Connection error occurred: {str(e)}
166
-
167
- # πŸ” **Troubleshooting:**
168
- # - Check your internet connection
169
- # - Verify OpenAI API is accessible
170
- # - Try again in a few moments
171
- # - If on Hugging Face, the service might be temporarily overloaded"""
172
- # elif "api" in error_msg and "key" in error_msg:
173
- # return f"""❌ API key error: {str(e)}
174
-
175
- # πŸ”‘ **Please check:**
176
- # - Your API key is correct and starts with 'sk-'
177
- # - Your OpenAI account has sufficient credits
178
- # - The API key has the necessary permissions"""
179
- # else:
180
- # return f"❌ Error processing PDF: {str(e)}"
181
-
182
- # def test_api_key(api_key):
183
- # """Test if the API key is working"""
184
- # if not api_key:
185
- # return "❌ Please provide an OpenAI API key"
186
-
187
- # if not api_key.startswith("sk-") or len(api_key) < 20:
188
- # return "❌ Invalid API key format. OpenAI keys should start with 'sk-' and be longer than 20 characters."
189
-
190
- # try:
191
- # from openai import OpenAI
192
- # client = OpenAI(api_key=api_key)
193
-
194
- # # Test with a simple API call
195
- # response = client.chat.completions.create(
196
- # model="gpt-4o-mini",
197
- # messages=[{"role": "user", "content": "Hello"}],
198
- # max_tokens=5
199
- # )
200
-
201
- # return "βœ… API key is working! You can now upload documents and ask questions."
202
-
203
- # except Exception as e:
204
- # error_msg = str(e).lower()
205
- # if "api" in error_msg and "key" in error_msg:
206
- # return f"❌ API key error: Invalid or expired API key. Please check your key and account credits."
207
- # elif "quota" in error_msg or "limit" in error_msg:
208
- # return f"❌ Quota/rate limit error: Your API key has reached its limit or you're out of credits."
209
- # elif "connection" in error_msg or "timeout" in error_msg:
210
- # return f"❌ Connection error: Unable to reach OpenAI API. Check your internet connection."
211
- # else:
212
- # return f"❌ Error testing API key: {str(e)}"
213
-
214
- # # def get_collection_info(api_key):
215
- # # """Get information about the current collection"""
216
- # # global chroma_uploader
217
-
218
- # # if not api_key:
219
- # # return "❌ Please provide an OpenAI API key"
220
-
221
- # # if chroma_uploader is None or current_api_key != api_key:
222
- # # init_msg = initialize_chroma_components(api_key)
223
- # # if "Error" in init_msg:
224
- # # return init_msg
225
-
226
- # # try:
227
- # # count = chroma_uploader.get_collection_count()
228
- # # if count == 0:
229
- # # return """πŸ“Š Collection is empty
230
-
231
- # # πŸš€ **Get started:**
232
- # # 1. Upload PDF files using the upload section above
233
- # # 2. Documents will be processed and stored automatically
234
- # # 3. Then you can ask questions about your documents"""
235
- # # else:
236
- # # return f"""πŸ“Š Collection Status:
237
-
238
- # # πŸ—ƒοΈ **Total documents:** {count} chunks
239
- # # βœ… **Status:** Ready for questions
240
- # # πŸ” **You can now:** Ask questions about your uploaded documents"""
241
- # # except Exception as e:
242
- # # return f"❌ Error getting collection info: {str(e)}"
243
-
244
- # # Create Gradio interface
245
- # def create_interface():
246
- # with gr.Blocks(title="CV Document Q&A System", theme=gr.themes.Soft()) as demo:
247
- # gr.Markdown(
248
- # """
249
- # # 📚 CV Document Q&A System
250
-
251
- # Upload the CV and ask questions about its content using AI-powered search and retrieval.
252
-
253
- # **⚠️ Important:** You need to provide your own OpenAI API key to use this application.
254
- # """
255
- # )
256
-
257
- # # API Key input (will be hidden)
258
- # with gr.Row():
259
- # with gr.Column(scale=4):
260
- # api_key_input = gr.Textbox(
261
- # label="πŸ”‘ OpenAI API Key",
262
- # placeholder="Enter your OpenAI API key (sk-...)",
263
- # type="password",
264
- # info="Your API key is not stored and is only used for this session"
265
- # )
266
- # with gr.Column(scale=1):
267
- # test_key_button = gr.Button("πŸ§ͺ Test API Key", variant="secondary")
268
-
269
- # api_test_output = gr.Markdown(label="API Key Status")
270
-
271
- # test_key_button.click(
272
- # test_api_key,
273
- # inputs=[api_key_input],
274
- # outputs=api_test_output
275
- # )
276
-
277
- # with gr.Tabs():
278
- # # Upload Tab (now first)
279
- # with gr.Tab("πŸ“„ Upload Documents"):
280
- # gr.Markdown("### Upload PDF documents to your knowledge base")
281
-
282
- # pdf_upload = gr.File(
283
- # label="Upload PDF File",
284
- # file_types=[".pdf"],
285
- # type="filepath"
286
- # )
287
-
288
- # upload_button = gr.Button("πŸ“ Process PDF", variant="primary")
289
- # upload_output = gr.Markdown(label="Upload Status")
290
-
291
- # upload_button.click(
292
- # upload_pdf,
293
- # inputs=[api_key_input, pdf_upload],
294
- # outputs=upload_output
295
- # )
296
-
297
- # # Collection info
298
- # # info_button = gr.Button("πŸ“Š Check Collection Status")
299
- # # info_output = gr.Markdown(label="Collection Information")
300
-
301
- # # info_button.click(
302
- # # get_collection_info,
303
- # # inputs=[api_key_input],
304
- # # outputs=info_output
305
- # # )
306
-
307
- # # Q&A Tab (now second)
308
- # with gr.Tab("πŸ€– Ask Questions"):
309
- # gr.Markdown("### Ask questions about your uploaded documents")
310
-
311
- # query_input = gr.Textbox(
312
- # label="Your Question",
313
- # placeholder="Ask me anything about your documents...",
314
- # lines=3
315
- # )
316
-
317
- # query_button = gr.Button("πŸ” Get Answer", variant="primary")
318
- # query_output = gr.Markdown(label="Answer")
319
-
320
- # query_button.click(
321
- # query_documents,
322
- # inputs=[api_key_input, query_input],
323
- # outputs=query_output
324
- # )
325
-
326
- # # Instructions
327
- # with gr.Accordion("πŸ“– How to Use & Troubleshooting", open=False):
328
- # gr.Markdown(
329
- # """
330
- # ### Instructions:
331
-
332
- # 1. **Enter your OpenAI API Key** - Get one from [OpenAI's website](https://platform.openai.com/api-keys)
333
- # 2. **Test your API Key** - Click "πŸ§ͺ Test API Key" to verify it's working
334
- # 3. **Upload PDF Documents** - Go to the "Upload Documents" tab and upload your PDF files.
335
- # 4. **Ask Questions** - Switch to the "Ask Questions" tab and query your documents
336
-
337
- # """
338
-
339
-
340
- # )
341
-
342
- # return demo
343
-
344
- # # Launch the application
345
- # if __name__ == "__main__":
346
- # demo = create_interface()
347
- # demo.launch(
348
- # server_name="0.0.0.0",
349
- # server_port=7860,
350
- # share=False # Set to True to create a public link
351
- # )
352
-
353
-
354
  import gradio as gr
355
  import os
356
  import tempfile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import os
3
  import tempfile
chromadb_upload.py CHANGED
@@ -1,287 +1,4 @@
1
 
2
- # import chromadb
3
- # import PyPDF2
4
- # import time
5
- # import chromadb.utils.embedding_functions as embedding_functions
6
- # import os
7
- # import io
8
-
9
- # class ChromaUploader:
10
- # def __init__(self, collection_name, db_path, api_key=None):
11
- # # Initialize Chroma persistent client and collection name
12
- # self.chroma_client = chromadb.PersistentClient(path=db_path)
13
- # self.collection_name = collection_name
14
- # self.collection = None
15
-
16
- # # Use provided API key or fall back to environment variable
17
- # self.openai_key = api_key or os.getenv("OPENAI_API_KEY")
18
-
19
- # if not self.openai_key:
20
- # raise ValueError("OpenAI API key is required")
21
-
22
- # self.openai_ef = embedding_functions.OpenAIEmbeddingFunction(
23
- # api_key=self.openai_key,
24
- # model_name="text-embedding-ada-002"
25
- # )
26
-
27
- # self._initialize_collection()
28
-
29
- # def _initialize_collection(self):
30
- # """
31
- # Initializes the collection if it doesn't exist.
32
- # """
33
- # try:
34
- # self.collection = self.chroma_client.get_collection(
35
- # name=self.collection_name,
36
- # embedding_function=self.openai_ef
37
- # )
38
- # print(f"Collection '{self.collection_name}' already exists.")
39
- # except Exception as e:
40
- # # If collection doesn't exist, create a new one
41
- # self.collection = self.chroma_client.create_collection(
42
- # name=self.collection_name,
43
- # embedding_function=self.openai_ef
44
- # )
45
- # print(f"Created new collection '{self.collection_name}'.")
46
-
47
- # def add_documents(self, documents):
48
- # """
49
- # Adds documents to the collection with retry mechanism and better error handling.
50
- # :param documents: List of document strings to be added
51
- # """
52
- # if documents is None or len(documents) == 0:
53
- # print("No data collected from the document to add.")
54
- # return False
55
-
56
- # try:
57
- # # Create unique IDs for each document chunk
58
- # timestamp = int(time.time() * 1000000) # microseconds for uniqueness
59
- # ids = [f"doc_{timestamp}_{i}" for i in range(len(documents))]
60
-
61
- # # Filter out empty documents
62
- # valid_documents = []
63
- # valid_ids = []
64
-
65
- # for i, doc in enumerate(documents):
66
- # if doc and doc.strip() and len(doc.strip()) > 10: # Only add non-empty docs with some content
67
- # valid_documents.append(doc.strip())
68
- # valid_ids.append(ids[i])
69
-
70
- # if not valid_documents:
71
- # print("No valid documents to add after filtering.")
72
- # return False
73
-
74
- # print(f"Attempting to add {len(valid_documents)} documents to collection...")
75
-
76
- # # Add documents to collection in smaller batches with retry
77
- # batch_size = 20 # Reduced batch size to avoid connection issues
78
- # total_added = 0
79
-
80
- # for i in range(0, len(valid_documents), batch_size):
81
- # batch_docs = valid_documents[i:i + batch_size]
82
- # batch_ids = valid_ids[i:i + batch_size]
83
-
84
- # success = self._add_batch_with_retry(batch_docs, batch_ids, max_retries=3)
85
- # if success:
86
- # total_added += len(batch_docs)
87
- # print(f"Successfully added batch {i//batch_size + 1}, total: {total_added}/{len(valid_documents)}")
88
- # else:
89
- # print(f"Failed to add batch {i//batch_size + 1} after retries")
90
- # # Continue with next batch instead of failing completely
91
-
92
- # if total_added > 0:
93
- # print(f"Successfully added {total_added} out of {len(valid_documents)} documents to collection '{self.collection_name}'.")
94
- # return True
95
- # else:
96
- # print("Failed to add any documents to the collection.")
97
- # return False
98
-
99
- # except Exception as e:
100
- # print(f"Error in add_documents: {e}")
101
- # return False
102
-
103
- # def _add_batch_with_retry(self, batch_docs, batch_ids, max_retries=3):
104
- # """
105
- # Add a batch of documents with retry mechanism
106
- # """
107
- # import time
108
-
109
- # for attempt in range(max_retries):
110
- # try:
111
- # print(f"Attempt {attempt + 1}/{max_retries} for batch of {len(batch_docs)} documents...")
112
-
113
- # self.collection.add(
114
- # documents=batch_docs,
115
- # ids=batch_ids
116
- # )
117
- # return True
118
-
119
- # except Exception as e:
120
- # error_msg = str(e).lower()
121
- # print(f"Attempt {attempt + 1} failed: {e}")
122
-
123
- # if "connection" in error_msg or "timeout" in error_msg or "rate" in error_msg:
124
- # # Network or rate limit issue - wait before retry
125
- # wait_time = (attempt + 1) * 2 # Linear backoff: 2s, 4s, 6s, ...
126
- # print(f"Connection/rate limit issue detected. Waiting {wait_time} seconds before retry...")
127
- # time.sleep(wait_time)
128
- # elif "api" in error_msg and "key" in error_msg:
129
- # # API key issue - no point in retrying
130
- # print("API key issue detected. Cannot retry.")
131
- # return False
132
- # else:
133
- # # Other error - short wait before retry
134
- # time.sleep(1)
135
-
136
- # if attempt == max_retries - 1:
137
- # print(f"All {max_retries} attempts failed for this batch.")
138
- # return False
139
-
140
- # return False
141
-
142
- # def extract_text_from_pdf_bytes(self, pdf_bytes):
143
- # """
144
- # Extracts text from a PDF file from bytes (for Gradio uploaded files).
145
- # :param pdf_bytes: PDF file as bytes
146
- # :return: Extracted text from the PDF and the lines as a list
147
- # """
148
- # try:
149
- # # Create a file-like object from bytes
150
- # pdf_file = io.BytesIO(pdf_bytes)
151
-
152
- # # Create a PDF reader object
153
- # pdf_reader = PyPDF2.PdfReader(pdf_file)
154
-
155
- # # Initialize an empty string to store extracted text
156
- # text = ""
157
-
158
- # # Extract text from each page
159
- # for page_num, page in enumerate(pdf_reader.pages):
160
- # try:
161
- # # Extract text from the page
162
- # page_text = page.extract_text()
163
-
164
- # # Clean up the extracted text
165
- # cleaned_text = self._clean_extracted_text(page_text)
166
-
167
- # if cleaned_text.strip(): # Only add non-empty pages
168
- # # Append to the total text with page marker
169
- # text += f"\n--- Page {page_num + 1} ---\n{cleaned_text}\n"
170
-
171
- # except Exception as e:
172
- # print(f"Error extracting text from page {page_num + 1}: {e}")
173
- # continue
174
-
175
- # if not text.strip():
176
- # return "", []
177
-
178
- # # Split text into meaningful chunks
179
- # chunks = self._split_text_into_chunks(text, max_chunk_size=1000, overlap=100)
180
-
181
- # return text.strip(), chunks
182
-
183
- # except Exception as e:
184
- # print(f"Error extracting text from PDF: {e}")
185
- # return "", []
186
-
187
- # def extract_text_from_pdf(self, pdf_path):
188
- # """
189
- # Extracts text from a PDF file using PyPDF2 with improved text extraction.
190
- # :param pdf_path: Path to the PDF file
191
- # :return: Extracted text from the PDF and the lines as a list
192
- # """
193
- # try:
194
- # # Open the PDF file
195
- # with open(pdf_path, 'rb') as file:
196
- # pdf_bytes = file.read()
197
- # return self.extract_text_from_pdf_bytes(pdf_bytes)
198
-
199
- # except Exception as e:
200
- # print(f"Error extracting text from PDF: {e}")
201
- # return "", []
202
-
203
- # def _clean_extracted_text(self, text):
204
- # """
205
- # Clean up extracted text to improve readability and remove unnecessary whitespace.
206
- # :param text: Raw extracted text
207
- # :return: Cleaned text
208
- # """
209
- # if not text:
210
- # return ""
211
-
212
- # # Remove excessive whitespace and clean up
213
- # lines = []
214
- # for line in text.split('\n'):
215
- # cleaned_line = line.strip()
216
- # if cleaned_line and len(cleaned_line) > 2: # Filter out very short lines
217
- # lines.append(cleaned_line)
218
-
219
- # # Join lines with proper spacing
220
- # cleaned_text = ' '.join(lines)
221
-
222
- # # Remove multiple spaces
223
- # while ' ' in cleaned_text:
224
- # cleaned_text = cleaned_text.replace(' ', ' ')
225
-
226
- # return cleaned_text
227
-
228
- # def _split_text_into_chunks(self, text, max_chunk_size=1000, overlap=100):
229
- # """
230
- # Split text into overlapping chunks for better context preservation.
231
- # :param text: Text to split
232
- # :param max_chunk_size: Maximum size of each chunk
233
- # :param overlap: Number of characters to overlap between chunks
234
- # :return: List of text chunks
235
- # """
236
- # if not text:
237
- # return []
238
-
239
- # chunks = []
240
- # start = 0
241
-
242
- # while start < len(text):
243
- # # Calculate end position
244
- # end = start + max_chunk_size
245
-
246
- # # If we're not at the end of the text, try to end at a sentence boundary
247
- # if end < len(text):
248
- # # Look for sentence endings within the last 200 characters
249
- # search_start = max(end - 200, start)
250
- # sentence_endings = ['. ', '! ', '? ', '\n\n']
251
-
252
- # best_end = end
253
- # for ending in sentence_endings:
254
- # pos = text.rfind(ending, search_start, end)
255
- # if pos > start:
256
- # best_end = pos + len(ending)
257
- # break
258
-
259
- # end = best_end
260
-
261
- # # Extract chunk
262
- # chunk = text[start:end].strip()
263
-
264
- # if chunk and len(chunk) > 50: # Only add substantial chunks
265
- # chunks.append(chunk)
266
-
267
- # # Move start position with overlap
268
- # start = max(start + 1, end - overlap)
269
-
270
- # # Safety check to prevent infinite loops
271
- # if start >= len(text):
272
- # break
273
-
274
- # return chunks
275
-
276
- # def get_collection_count(self):
277
- # """
278
- # Get the number of documents in the collection.
279
- # """
280
- # try:
281
- # return self.collection.count()
282
- # except Exception as e:
283
- # print(f"Error getting collection count: {e}")
284
- # return 0
285
 
286
 
287
  import chromadb
 
1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
 
4
  import chromadb