dure-waseem committed on
Commit
b1c00a1
·
1 Parent(s): 1ad4324

initial code

Browse files
Files changed (3) hide show
  1. app.py +38 -462
  2. chromadb_query.py +0 -118
  3. chromadb_upload.py +0 -232
app.py CHANGED
@@ -1,396 +1,3 @@
1
-
2
- # import gradio as gr
3
- # import os
4
- # import tempfile
5
- # import shutil
6
- # from chromadb_query import ChromaCollection
7
- # from chromadb_upload import ChromaUploader
8
-
9
- # # Global variables to store instances
10
- # chroma_collection = None
11
- # chroma_uploader = None
12
- # current_api_key = None
13
-
14
- # def initialize_chroma_components(api_key):
15
- # """Initialize ChromaDB components with the provided API key"""
16
- # global chroma_collection, chroma_uploader, current_api_key
17
-
18
- # if not api_key:
19
- # return "❌ Please provide an OpenAI API key"
20
-
21
- # try:
22
- # # Set the API key in environment
23
- # os.environ["OPENAI_API_KEY"] = api_key
24
- # current_api_key = api_key
25
-
26
- # # Initialize components
27
- # db_path = "./db"
28
- # os.makedirs(db_path, exist_ok=True)
29
- # collection_name = "my_collection"
30
-
31
- # chroma_collection = ChromaCollection(collection_name, db_path, api_key)
32
- # chroma_uploader = ChromaUploader(collection_name, db_path, api_key)
33
-
34
- # return "βœ… ChromaDB components initialized successfully!"
35
-
36
- # except Exception as e:
37
- # return f"❌ Error initializing components: {str(e)}"
38
-
39
- # def query_documents(api_key, query, n_results):
40
- # """Query the document collection"""
41
- # global chroma_collection
42
-
43
- # if not api_key:
44
- # return "❌ Please provide an OpenAI API key"
45
-
46
- # if not query.strip():
47
- # return "❌ Please enter a query"
48
-
49
- # # Validate API key format
50
- # if not api_key.startswith("sk-") or len(api_key) < 20:
51
- # return "❌ Invalid OpenAI API key format. It should start with 'sk-' and be longer than 20 characters."
52
-
53
- # # Initialize or check if we need to reinitialize
54
- # if chroma_collection is None or current_api_key != api_key:
55
- # init_msg = initialize_chroma_components(api_key)
56
- # if "Error" in init_msg:
57
- # return init_msg
58
-
59
- # try:
60
- # # Query the collection
61
- # results = chroma_collection.query_collection([query], n_results=n_results)
62
-
63
- # if not results['documents'][0]:
64
- # return """❌ No documents found in the collection.
65
-
66
- # πŸ“š **Next steps:**
67
- # 1. Go to the "πŸ“„ Upload Documents" tab
68
- # 2. Upload some PDF files first
69
- # 3. Come back and ask your question"""
70
-
71
- # # Generate answer
72
- # answer = chroma_collection.generate_answer(query, results)
73
-
74
- # # Check if answer indicates an error
75
- # if answer.startswith("Error generating answer"):
76
- # return f"""❌ Error generating answer: {answer}
77
-
78
- # πŸ” **Troubleshooting:**
79
- # - Check your internet connection
80
- # - Verify your OpenAI API key has credits
81
- # - Try a simpler question
82
- # - Wait a moment and try again"""
83
-
84
- # # Count documents for context
85
- # try:
86
- # doc_count = chroma_collection.get_collection_count()
87
- # context_info = f"\n\n---\n*Answer based on {len(results['documents'][0])} relevant chunks from {doc_count} total documents*"
88
- # except:
89
- # context_info = f"\n\n---\n*Answer based on {len(results['documents'][0])} relevant document chunks*"
90
-
91
- # return f"πŸ€– **Answer:**\n\n{answer}{context_info}"
92
-
93
- # except Exception as e:
94
- # error_msg = str(e).lower()
95
- # if "connection" in error_msg or "timeout" in error_msg:
96
- # return f"""❌ Connection error: {str(e)}
97
-
98
- # πŸ” **Troubleshooting:**
99
- # - Check your internet connection
100
- # - Verify OpenAI API is accessible
101
- # - Try again in a few moments"""
102
- # elif "api" in error_msg and "key" in error_msg:
103
- # return f"""❌ API key error: {str(e)}
104
-
105
- # πŸ”‘ **Please check:**
106
- # - Your API key is correct
107
- # - Your OpenAI account has sufficient credits
108
- # - The API key has the necessary permissions"""
109
- # else:
110
- # return f"❌ Error querying documents: {str(e)}"
111
-
112
- # def upload_pdf(api_key, pdf_file):
113
- # """Upload and process PDF file"""
114
- # global chroma_uploader
115
-
116
- # if not api_key:
117
- # return "❌ Please provide an OpenAI API key"
118
-
119
- # if pdf_file is None:
120
- # return "❌ Please upload a PDF file"
121
-
122
- # # Validate API key format
123
- # if not api_key.startswith("sk-") or len(api_key) < 20:
124
- # return "❌ Invalid OpenAI API key format. It should start with 'sk-' and be longer than 20 characters."
125
-
126
- # # Initialize or check if we need to reinitialize
127
- # if chroma_uploader is None or current_api_key != api_key:
128
- # init_msg = initialize_chroma_components(api_key)
129
- # if "Error" in init_msg:
130
- # return init_msg
131
-
132
- # try:
133
- # # Read the PDF file
134
- # with open(pdf_file.name, 'rb') as file:
135
- # pdf_bytes = file.read()
136
-
137
- # # Extract text from PDF
138
- # pdf_text, pdf_lines = chroma_uploader.extract_text_from_pdf_bytes(pdf_bytes)
139
-
140
- # if not pdf_text or not pdf_lines:
141
- # return "❌ Could not extract text from the PDF file. Make sure it's a text-based PDF (not scanned images)."
142
-
143
- # # Add documents to ChromaDB with better feedback
144
- # print(f"Processing {len(pdf_lines)} document chunks...")
145
- # success = chroma_uploader.add_documents(pdf_lines)
146
-
147
- # if success:
148
- # # Get updated count
149
- # try:
150
- # count = chroma_uploader.get_collection_count()
151
- # return f"βœ… Successfully processed PDF!\n\nπŸ“Š Added document chunks from '{os.path.basename(pdf_file.name)}'\nπŸ—ƒοΈ Total documents in collection: {count}"
152
- # except:
153
- # return f"βœ… Successfully processed and added document chunks from '{os.path.basename(pdf_file.name)}'!"
154
- # else:
155
- # return """❌ Failed to add documents to ChromaDB.
156
-
157
- # πŸ” **Troubleshooting tips:**
158
- # - Check your internet connection
159
- # - Verify your OpenAI API key has credits
160
- # - Try uploading a smaller PDF file
161
- # - Wait a moment and try again (rate limits)"""
162
-
163
- # except Exception as e:
164
- # error_msg = str(e).lower()
165
- # if "connection" in error_msg or "timeout" in error_msg:
166
- # return f"""❌ Connection error occurred: {str(e)}
167
-
168
- # πŸ” **Troubleshooting:**
169
- # - Check your internet connection
170
- # - Verify OpenAI API is accessible
171
- # - Try again in a few moments
172
- # - If on Hugging Face, the service might be temporarily overloaded"""
173
- # elif "api" in error_msg and "key" in error_msg:
174
- # return f"""❌ API key error: {str(e)}
175
-
176
- # πŸ”‘ **Please check:**
177
- # - Your API key is correct and starts with 'sk-'
178
- # - Your OpenAI account has sufficient credits
179
- # - The API key has the necessary permissions"""
180
- # else:
181
- # return f"❌ Error processing PDF: {str(e)}"
182
-
183
- # def test_api_key(api_key):
184
- # """Test if the API key is working"""
185
- # if not api_key:
186
- # return "❌ Please provide an OpenAI API key"
187
-
188
- # if not api_key.startswith("sk-") or len(api_key) < 20:
189
- # return "❌ Invalid API key format. OpenAI keys should start with 'sk-' and be longer than 20 characters."
190
-
191
- # try:
192
- # from openai import OpenAI
193
- # client = OpenAI(api_key=api_key)
194
-
195
- # # Test with a simple API call
196
- # response = client.chat.completions.create(
197
- # model="gpt-4o-mini",
198
- # messages=[{"role": "user", "content": "Hello"}],
199
- # max_tokens=5
200
- # )
201
-
202
- # return "βœ… API key is working! You can now upload documents and ask questions."
203
-
204
- # except Exception as e:
205
- # error_msg = str(e).lower()
206
- # if "api" in error_msg and "key" in error_msg:
207
- # return f"❌ API key error: Invalid or expired API key. Please check your key and account credits."
208
- # elif "quota" in error_msg or "limit" in error_msg:
209
- # return f"❌ Quota/rate limit error: Your API key has reached its limit or you're out of credits."
210
- # elif "connection" in error_msg or "timeout" in error_msg:
211
- # return f"❌ Connection error: Unable to reach OpenAI API. Check your internet connection."
212
- # else:
213
- # return f"❌ Error testing API key: {str(e)}"
214
-
215
- # def get_collection_info(api_key):
216
- # """Get information about the current collection"""
217
- # global chroma_uploader
218
-
219
- # if not api_key:
220
- # return "❌ Please provide an OpenAI API key"
221
-
222
- # if chroma_uploader is None or current_api_key != api_key:
223
- # init_msg = initialize_chroma_components(api_key)
224
- # if "Error" in init_msg:
225
- # return init_msg
226
-
227
- # try:
228
- # count = chroma_uploader.get_collection_count()
229
- # if count == 0:
230
- # return """πŸ“Š Collection is empty
231
-
232
- # πŸš€ **Get started:**
233
- # 1. Upload PDF files using the upload section above
234
- # 2. Documents will be processed and stored automatically
235
- # 3. Then you can ask questions about your documents"""
236
- # else:
237
- # return f"""πŸ“Š Collection Status:
238
-
239
- # πŸ—ƒοΈ **Total documents:** {count} chunks
240
- # βœ… **Status:** Ready for questions
241
- # πŸ” **You can now:** Ask questions about your uploaded documents"""
242
- # except Exception as e:
243
- # return f"❌ Error getting collection info: {str(e)}"
244
-
245
- # # Create Gradio interface
246
- # def create_interface():
247
- # with gr.Blocks(title="CV-Info-Agent", theme=gr.themes.Soft()) as demo:
248
- # gr.Markdown(
249
- # """
250
- # # πŸ“š ChromaDB Q&A System
251
-
252
- # Upload PDF documents and ask questions about their content using AI-powered search and retrieval.
253
-
254
- # **⚠️ Important:** You need to provide your own OpenAI API key to use this application.
255
- # """
256
- # )
257
-
258
- # # API Key input (will be hidden)
259
- # with gr.Row():
260
- # with gr.Column(scale=4):
261
- # api_key_input = gr.Textbox(
262
- # label="πŸ”‘ OpenAI API Key",
263
- # placeholder="Enter your OpenAI API key (sk-...)",
264
- # type="password",
265
- # info="Your API key is not stored and is only used for this session"
266
- # )
267
- # with gr.Column(scale=1):
268
- # test_key_button = gr.Button("πŸ§ͺ Test API Key", variant="secondary")
269
-
270
- # api_test_output = gr.Markdown(label="API Key Status")
271
-
272
- # test_key_button.click(
273
- # test_api_key,
274
- # inputs=[api_key_input],
275
- # outputs=api_test_output
276
- # )
277
-
278
- # with gr.Tabs():
279
- # # Q&A Tab
280
- # with gr.Tab("πŸ€– Ask Questions"):
281
- # gr.Markdown("### Ask questions about your uploaded documents")
282
-
283
- # with gr.Row():
284
- # with gr.Column(scale=3):
285
- # query_input = gr.Textbox(
286
- # label="Your Question",
287
- # placeholder="Ask me anything about your documents...",
288
- # lines=3
289
- # )
290
- # with gr.Column(scale=1):
291
- # n_results_slider = gr.Slider(
292
- # minimum=1,
293
- # maximum=20,
294
- # value=10,
295
- # step=1,
296
- # label="Max Results"
297
- # )
298
-
299
- # query_button = gr.Button("πŸ” Get Answer", variant="primary")
300
- # query_output = gr.Markdown(label="Answer")
301
-
302
- # query_button.click(
303
- # query_documents,
304
- # inputs=[api_key_input, query_input, n_results_slider],
305
- # outputs=query_output
306
- # )
307
-
308
- # # Upload Tab
309
- # with gr.Tab("πŸ“„ Upload Documents"):
310
- # gr.Markdown("### Upload PDF documents to your knowledge base")
311
-
312
- # pdf_upload = gr.File(
313
- # label="Upload PDF File",
314
- # file_types=[".pdf"],
315
- # type="filepath"
316
- # )
317
-
318
- # upload_button = gr.Button("πŸ“ Process PDF", variant="primary")
319
- # upload_output = gr.Markdown(label="Upload Status")
320
-
321
- # upload_button.click(
322
- # upload_pdf,
323
- # inputs=[api_key_input, pdf_upload],
324
- # outputs=upload_output
325
- # )
326
-
327
- # # Collection info
328
- # info_button = gr.Button("πŸ“Š Check Collection Status")
329
- # info_output = gr.Markdown(label="Collection Information")
330
-
331
- # info_button.click(
332
- # get_collection_info,
333
- # inputs=[api_key_input],
334
- # outputs=info_output
335
- # )
336
-
337
- # # Instructions
338
- # with gr.Accordion("πŸ“– How to Use & Troubleshooting", open=False):
339
- # gr.Markdown(
340
- # """
341
- # ### Instructions:
342
-
343
- # 1. **Enter your OpenAI API Key** - Get one from [OpenAI's website](https://platform.openai.com/api-keys)
344
- # 2. **Test your API Key** - Click "πŸ§ͺ Test API Key" to verify it's working
345
- # 3. **Upload PDF Documents** - Go to the "Upload Documents" tab and upload your PDF files
346
- # 4. **Ask Questions** - Switch to the "Ask Questions" tab and query your documents
347
-
348
- # ### 🚨 Troubleshooting Connection Errors:
349
-
350
- # **"Connection error" when uploading documents:**
351
- # - βœ… Check your internet connection
352
- # - βœ… Verify your OpenAI API key has sufficient credits
353
- # - βœ… Wait 30 seconds and try again (rate limits)
354
- # - βœ… Try uploading smaller PDF files
355
- # - βœ… If on Hugging Face Spaces, the service might be temporarily overloaded
356
-
357
- # **API Key Issues:**
358
- # - βœ… Make sure your key starts with `sk-`
359
- # - βœ… Check your OpenAI account has credits
360
- # - βœ… Verify the key has proper permissions
361
- # - βœ… Test your key using the "πŸ§ͺ Test API Key" button
362
-
363
- # **PDF Upload Issues:**
364
- # - βœ… Ensure PDF contains text (not just images)
365
- # - βœ… Try smaller PDF files (under 10MB)
366
- # - βœ… Check PDF isn't password protected
367
-
368
- # ### Features:
369
- # - πŸ”’ **Secure**: Your API key is not stored permanently
370
- # - πŸ“š **Multiple Documents**: Upload multiple PDFs to build your knowledge base
371
- # - 🎯 **Accurate Answers**: Get AI-powered answers based on your document content
372
- # - ⚑ **Fast Search**: Vector-based similarity search for relevant content
373
- # - πŸ”„ **Retry Logic**: Automatic retry for connection issues
374
-
375
- # ### Notes:
376
- # - PDF text extraction works with most standard PDF formats
377
- # - Documents are stored locally during your session
378
- # - Each document is chunked for better search performance
379
- # - The system uses OpenAI's text-embedding-ada-002 for embeddings
380
- # - Answers are generated using GPT-4o-mini model
381
- # """
382
- # )
383
-
384
- # return demo
385
-
386
- # # Launch the application
387
- # if __name__ == "__main__":
388
- # demo = create_interface()
389
- # demo.launch(
390
- # server_name="0.0.0.0",
391
- # server_port=7860,
392
- # share=True # Set to True to create a public link
393
- # )
394
  import gradio as gr
395
  import os
396
  import tempfile
@@ -604,45 +211,45 @@ def test_api_key(api_key):
604
  else:
605
  return f"❌ Error testing API key: {str(e)}"
606
 
607
- def get_collection_info(api_key):
608
- """Get information about the current collection"""
609
- global chroma_uploader
610
 
611
- if not api_key:
612
- return "❌ Please provide an OpenAI API key"
613
 
614
- if chroma_uploader is None or current_api_key != api_key:
615
- init_msg = initialize_chroma_components(api_key)
616
- if "Error" in init_msg:
617
- return init_msg
618
 
619
- try:
620
- count = chroma_uploader.get_collection_count()
621
- if count == 0:
622
- return """πŸ“Š Collection is empty
623
 
624
- πŸš€ **Get started:**
625
- 1. Upload PDF files using the upload section above
626
- 2. Documents will be processed and stored automatically
627
- 3. Then you can ask questions about your documents"""
628
- else:
629
- return f"""πŸ“Š Collection Status:
630
 
631
- πŸ—ƒοΈ **Total documents:** {count} chunks
632
- βœ… **Status:** Ready for questions
633
- πŸ” **You can now:** Ask questions about your uploaded documents"""
634
- except Exception as e:
635
- return f"❌ Error getting collection info: {str(e)}"
636
 
637
  # Create Gradio interface
638
  def create_interface():
639
- with gr.Blocks(title="ChromaDB Q&A System", theme=gr.themes.Soft()) as demo:
640
  gr.Markdown(
641
  """
642
- # πŸ“š ChromaDB Q&A System
643
-
644
- Upload PDF documents and ask questions about their content using AI-powered search and retrieval.
645
-
646
  **⚠️ Important:** You need to provide your own OpenAI API key to use this application.
647
  """
648
  )
@@ -688,14 +295,14 @@ def create_interface():
688
  )
689
 
690
  # Collection info
691
- info_button = gr.Button("πŸ“Š Check Collection Status")
692
- info_output = gr.Markdown(label="Collection Information")
693
 
694
- info_button.click(
695
- get_collection_info,
696
- inputs=[api_key_input],
697
- outputs=info_output
698
- )
699
 
700
  # Q&A Tab (now second)
701
  with gr.Tab("πŸ€– Ask Questions"):
@@ -724,43 +331,12 @@ def create_interface():
724
 
725
  1. **Enter your OpenAI API Key** - Get one from [OpenAI's website](https://platform.openai.com/api-keys)
726
  2. **Test your API Key** - Click "πŸ§ͺ Test API Key" to verify it's working
727
- 3. **Upload PDF Documents** - Go to the "Upload Documents" tab and upload your PDF files
728
  4. **Ask Questions** - Switch to the "Ask Questions" tab and query your documents
729
 
730
- ### 🚨 Troubleshooting Connection Errors:
731
-
732
- **"Connection error" when uploading documents:**
733
- - βœ… Check your internet connection
734
- - βœ… Verify your OpenAI API key has sufficient credits
735
- - βœ… Wait 30 seconds and try again (rate limits)
736
- - βœ… Try uploading smaller PDF files
737
- - βœ… If on Hugging Face Spaces, the service might be temporarily overloaded
738
-
739
- **API Key Issues:**
740
- - βœ… Make sure your key starts with `sk-`
741
- - βœ… Check your OpenAI account has credits
742
- - βœ… Verify the key has proper permissions
743
- - βœ… Test your key using the "πŸ§ͺ Test API Key" button
744
 
745
- **PDF Upload Issues:**
746
- - βœ… Ensure PDF contains text (not just images)
747
- - βœ… Try smaller PDF files (under 10MB)
748
- - βœ… Check PDF isn't password protected
749
 
750
- ### Features:
751
- - πŸ”’ **Secure**: Your API key is not stored permanently
752
- - πŸ“š **Multiple Documents**: Upload multiple PDFs to build your knowledge base
753
- - 🎯 **Accurate Answers**: Get AI-powered answers based on your document content
754
- - ⚑ **Fast Search**: Vector-based similarity search for relevant content
755
- - πŸ”„ **Retry Logic**: Automatic retry for connection issues
756
-
757
- ### Notes:
758
- - PDF text extraction works with most standard PDF formats
759
- - Documents are stored locally during your session
760
- - Each document is chunked for better search performance
761
- - The system uses OpenAI's text-embedding-ada-002 for embeddings
762
- - Answers are generated using GPT-4o-mini model
763
- """
764
  )
765
 
766
  return demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import os
3
  import tempfile
 
211
  else:
212
  return f"❌ Error testing API key: {str(e)}"
213
 
214
+ # def get_collection_info(api_key):
215
+ # """Get information about the current collection"""
216
+ # global chroma_uploader
217
 
218
+ # if not api_key:
219
+ # return "❌ Please provide an OpenAI API key"
220
 
221
+ # if chroma_uploader is None or current_api_key != api_key:
222
+ # init_msg = initialize_chroma_components(api_key)
223
+ # if "Error" in init_msg:
224
+ # return init_msg
225
 
226
+ # try:
227
+ # count = chroma_uploader.get_collection_count()
228
+ # if count == 0:
229
+ # return """πŸ“Š Collection is empty
230
 
231
+ # πŸš€ **Get started:**
232
+ # 1. Upload PDF files using the upload section above
233
+ # 2. Documents will be processed and stored automatically
234
+ # 3. Then you can ask questions about your documents"""
235
+ # else:
236
+ # return f"""πŸ“Š Collection Status:
237
 
238
+ # πŸ—ƒοΈ **Total documents:** {count} chunks
239
+ # βœ… **Status:** Ready for questions
240
+ # πŸ” **You can now:** Ask questions about your uploaded documents"""
241
+ # except Exception as e:
242
+ # return f"❌ Error getting collection info: {str(e)}"
243
 
244
  # Create Gradio interface
245
  def create_interface():
246
+ with gr.Blocks(title="CV Document Q&A System", theme=gr.themes.Soft()) as demo:
247
  gr.Markdown(
248
  """
249
+ # 📚 CV Document Q&A System
250
+
251
+ Upload the CV and ask questions about its content using AI-powered search and retrieval.
252
+
253
  **⚠️ Important:** You need to provide your own OpenAI API key to use this application.
254
  """
255
  )
 
295
  )
296
 
297
  # Collection info
298
+ # info_button = gr.Button("πŸ“Š Check Collection Status")
299
+ # info_output = gr.Markdown(label="Collection Information")
300
 
301
+ # info_button.click(
302
+ # get_collection_info,
303
+ # inputs=[api_key_input],
304
+ # outputs=info_output
305
+ # )
306
 
307
  # Q&A Tab (now second)
308
  with gr.Tab("πŸ€– Ask Questions"):
 
331
 
332
  1. **Enter your OpenAI API Key** - Get one from [OpenAI's website](https://platform.openai.com/api-keys)
333
  2. **Test your API Key** - Click "πŸ§ͺ Test API Key" to verify it's working
334
+ 3. **Upload PDF Documents** - Go to the "Upload Documents" tab and upload your PDF files.
335
  4. **Ask Questions** - Switch to the "Ask Questions" tab and query your documents
336
 
337
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
338
 
 
 
 
 
339
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
  )
341
 
342
  return demo
chromadb_query.py CHANGED
@@ -1,122 +1,4 @@
1
- # import chromadb
2
- # import time
3
- # import chromadb.utils.embedding_functions as embedding_functions
4
- # import os
5
- # from openai import OpenAI
6
 
7
- # class ChromaCollection:
8
- # def __init__(self, collection_name, db_path, api_key=None):
9
- # # Initialize Chroma persistent client and collection name
10
- # self.chroma_client = chromadb.PersistentClient(path=db_path)
11
- # self.collection_name = collection_name
12
- # self.collection = None
13
-
14
- # # Use provided API key or fall back to environment variable
15
- # self.openai_key = api_key or os.getenv("OPENAI_API_KEY")
16
-
17
- # if not self.openai_key:
18
- # raise ValueError("OpenAI API key is required")
19
-
20
- # self.openai_ef = embedding_functions.OpenAIEmbeddingFunction(
21
- # api_key=self.openai_key,
22
- # model_name="text-embedding-ada-002"
23
- # )
24
-
25
- # # Initialize OpenAI client
26
- # self.openai_client = OpenAI(api_key=self.openai_key)
27
- # self._initialize_collection()
28
-
29
- # def _initialize_collection(self):
30
- # """
31
- # Initializes the collection if it doesn't exist.
32
- # """
33
- # try:
34
- # self.collection = self.chroma_client.get_collection(
35
- # name=self.collection_name,
36
- # embedding_function=self.openai_ef
37
- # )
38
- # print(f"Collection '{self.collection_name}' already exists.")
39
- # except Exception as e:
40
- # # If collection doesn't exist, create a new one
41
- # self.collection = self.chroma_client.create_collection(
42
- # name=self.collection_name,
43
- # embedding_function=self.openai_ef
44
- # )
45
- # print(f"Created new collection '{self.collection_name}'.")
46
-
47
- # def query_collection(self, query_texts, n_results=1):
48
- # """
49
- # Queries the collection with the given text and returns the results.
50
- # :param query_texts: List of query strings
51
- # :param n_results: Number of results to return
52
- # :return: Query results
53
- # """
54
- # try:
55
- # results = self.collection.query(
56
- # query_texts=query_texts, # Chroma will embed this for you
57
- # n_results=n_results # How many results to return
58
- # )
59
- # return results
60
- # except Exception as e:
61
- # print(f"Error querying collection: {e}")
62
- # return {"documents": [[]], "metadatas": [[]], "distances": [[]]}
63
-
64
- # def generate_answer(self, query, results):
65
- # """
66
- # Takes the query and ChromaDB results and generates an accurate answer using the LLM.
67
- # :param query: User's query
68
- # :param results: ChromaDB results
69
- # :return: Generated answer from LLM
70
- # """
71
- # # Check if we have any results
72
- # if not results['documents'][0]:
73
- # return "No relevant documents found to answer your question."
74
-
75
- # # Prepare the context for LLM by appending the query and results
76
- # documents_text = "\n".join(results['documents'][0][:5]) # Use top 5 results
77
-
78
- # context = f"""Based on the following context from the documents, please answer the user's question accurately and concisely.
79
-
80
- # Context from documents:
81
- # {documents_text}
82
-
83
- # User's question: {query}
84
-
85
- # Please provide a clear and accurate answer based only on the information provided in the context above."""
86
-
87
- # try:
88
- # # Use the new OpenAI API format
89
- # response = self.openai_client.chat.completions.create(
90
- # model="gpt-4o-mini",
91
- # messages=[
92
- # {
93
- # "role": "system",
94
- # "content": "You are a helpful assistant that answers questions based on provided document context. Only use information from the provided context to answer questions."
95
- # },
96
- # {
97
- # "role": "user",
98
- # "content": context
99
- # }
100
- # ],
101
- # max_tokens=500,
102
- # temperature=0.1
103
- # )
104
-
105
- # # Extract and return the answer from the response
106
- # return response.choices[0].message.content.strip()
107
-
108
- # except Exception as e:
109
- # return f"Error generating answer: {str(e)}"
110
-
111
- # def get_collection_count(self):
112
- # """
113
- # Get the number of documents in the collection.
114
- # """
115
- # try:
116
- # return self.collection.count()
117
- # except Exception as e:
118
- # print(f"Error getting collection count: {e}")
119
- # return 0
120
  import chromadb
121
  import time
122
  import chromadb.utils.embedding_functions as embedding_functions
 
 
 
 
 
 
1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import chromadb
3
  import time
4
  import chromadb.utils.embedding_functions as embedding_functions
chromadb_upload.py CHANGED
@@ -1,236 +1,4 @@
1
- # import chromadb
2
- # import PyPDF2
3
- # import time
4
- # import chromadb.utils.embedding_functions as embedding_functions
5
- # import os
6
- # import io
7
 
8
- # class ChromaUploader:
9
- # def __init__(self, collection_name, db_path, api_key=None):
10
- # # Initialize Chroma persistent client and collection name
11
- # self.chroma_client = chromadb.PersistentClient(path=db_path)
12
- # self.collection_name = collection_name
13
- # self.collection = None
14
-
15
- # # Use provided API key or fall back to environment variable
16
- # self.openai_key = api_key or os.getenv("OPENAI_API_KEY")
17
-
18
- # if not self.openai_key:
19
- # raise ValueError("OpenAI API key is required")
20
-
21
- # self.openai_ef = embedding_functions.OpenAIEmbeddingFunction(
22
- # api_key=self.openai_key,
23
- # model_name="text-embedding-ada-002"
24
- # )
25
-
26
- # self._initialize_collection()
27
-
28
- # def _initialize_collection(self):
29
- # """
30
- # Initializes the collection if it doesn't exist.
31
- # """
32
- # try:
33
- # self.collection = self.chroma_client.get_collection(
34
- # name=self.collection_name,
35
- # embedding_function=self.openai_ef
36
- # )
37
- # print(f"Collection '{self.collection_name}' already exists.")
38
- # except Exception as e:
39
- # # If collection doesn't exist, create a new one
40
- # self.collection = self.chroma_client.create_collection(
41
- # name=self.collection_name,
42
- # embedding_function=self.openai_ef
43
- # )
44
- # print(f"Created new collection '{self.collection_name}'.")
45
-
46
- # def add_documents(self, documents):
47
- # """
48
- # Adds documents to the collection, ensuring no duplicate IDs.
49
- # :param documents: List of document strings to be added
50
- # """
51
- # if documents is None or len(documents) == 0:
52
- # print("No data collected from the document to add.")
53
- # return False
54
-
55
- # try:
56
- # # Create unique IDs for each document chunk
57
- # timestamp = int(time.time() * 1000000) # microseconds for uniqueness
58
- # ids = [f"doc_{timestamp}_{i}" for i in range(len(documents))]
59
-
60
- # # Filter out empty documents
61
- # valid_documents = []
62
- # valid_ids = []
63
-
64
- # for i, doc in enumerate(documents):
65
- # if doc and doc.strip() and len(doc.strip()) > 10: # Only add non-empty docs with some content
66
- # valid_documents.append(doc.strip())
67
- # valid_ids.append(ids[i])
68
-
69
- # if not valid_documents:
70
- # print("No valid documents to add after filtering.")
71
- # return False
72
-
73
- # # Add documents to collection in batches to avoid memory issues
74
- # batch_size = 100
75
- # for i in range(0, len(valid_documents), batch_size):
76
- # batch_docs = valid_documents[i:i + batch_size]
77
- # batch_ids = valid_ids[i:i + batch_size]
78
-
79
- # self.collection.add(
80
- # documents=batch_docs,
81
- # ids=batch_ids
82
- # )
83
-
84
- # print(f"Added {len(valid_documents)} documents to collection '{self.collection_name}'.")
85
- # return True
86
-
87
- # except Exception as e:
88
- # print(f"Error adding documents to collection: {e}")
89
- # return False
90
-
91
- # def extract_text_from_pdf_bytes(self, pdf_bytes):
92
- # """
93
- # Extracts text from a PDF file from bytes (for Gradio uploaded files).
94
- # :param pdf_bytes: PDF file as bytes
95
- # :return: Extracted text from the PDF and the lines as a list
96
- # """
97
- # try:
98
- # # Create a file-like object from bytes
99
- # pdf_file = io.BytesIO(pdf_bytes)
100
-
101
- # # Create a PDF reader object
102
- # pdf_reader = PyPDF2.PdfReader(pdf_file)
103
-
104
- # # Initialize an empty string to store extracted text
105
- # text = ""
106
-
107
- # # Extract text from each page
108
- # for page_num, page in enumerate(pdf_reader.pages):
109
- # try:
110
- # # Extract text from the page
111
- # page_text = page.extract_text()
112
-
113
- # # Clean up the extracted text
114
- # cleaned_text = self._clean_extracted_text(page_text)
115
-
116
- # if cleaned_text.strip(): # Only add non-empty pages
117
- # # Append to the total text with page marker
118
- # text += f"\n--- Page {page_num + 1} ---\n{cleaned_text}\n"
119
-
120
- # except Exception as e:
121
- # print(f"Error extracting text from page {page_num + 1}: {e}")
122
- # continue
123
-
124
- # if not text.strip():
125
- # return "", []
126
-
127
- # # Split text into meaningful chunks
128
- # chunks = self._split_text_into_chunks(text, max_chunk_size=1000, overlap=100)
129
-
130
- # return text.strip(), chunks
131
-
132
- # except Exception as e:
133
- # print(f"Error extracting text from PDF: {e}")
134
- # return "", []
135
-
136
- # def extract_text_from_pdf(self, pdf_path):
137
- # """
138
- # Extracts text from a PDF file using PyPDF2 with improved text extraction.
139
- # :param pdf_path: Path to the PDF file
140
- # :return: Extracted text from the PDF and the lines as a list
141
- # """
142
- # try:
143
- # # Open the PDF file
144
- # with open(pdf_path, 'rb') as file:
145
- # pdf_bytes = file.read()
146
- # return self.extract_text_from_pdf_bytes(pdf_bytes)
147
-
148
- # except Exception as e:
149
- # print(f"Error extracting text from PDF: {e}")
150
- # return "", []
151
-
152
- # def _clean_extracted_text(self, text):
153
- # """
154
- # Clean up extracted text to improve readability and remove unnecessary whitespace.
155
- # :param text: Raw extracted text
156
- # :return: Cleaned text
157
- # """
158
- # if not text:
159
- # return ""
160
-
161
- # # Remove excessive whitespace and clean up
162
- # lines = []
163
- # for line in text.split('\n'):
164
- # cleaned_line = line.strip()
165
- # if cleaned_line and len(cleaned_line) > 2: # Filter out very short lines
166
- # lines.append(cleaned_line)
167
-
168
- # # Join lines with proper spacing
169
- # cleaned_text = ' '.join(lines)
170
-
171
- # # Remove multiple spaces
172
- # while ' ' in cleaned_text:
173
- # cleaned_text = cleaned_text.replace(' ', ' ')
174
-
175
- # return cleaned_text
176
-
177
- # def _split_text_into_chunks(self, text, max_chunk_size=1000, overlap=100):
178
- # """
179
- # Split text into overlapping chunks for better context preservation.
180
- # :param text: Text to split
181
- # :param max_chunk_size: Maximum size of each chunk
182
- # :param overlap: Number of characters to overlap between chunks
183
- # :return: List of text chunks
184
- # """
185
- # if not text:
186
- # return []
187
-
188
- # chunks = []
189
- # start = 0
190
-
191
- # while start < len(text):
192
- # # Calculate end position
193
- # end = start + max_chunk_size
194
-
195
- # # If we're not at the end of the text, try to end at a sentence boundary
196
- # if end < len(text):
197
- # # Look for sentence endings within the last 200 characters
198
- # search_start = max(end - 200, start)
199
- # sentence_endings = ['. ', '! ', '? ', '\n\n']
200
-
201
- # best_end = end
202
- # for ending in sentence_endings:
203
- # pos = text.rfind(ending, search_start, end)
204
- # if pos > start:
205
- # best_end = pos + len(ending)
206
- # break
207
-
208
- # end = best_end
209
-
210
- # # Extract chunk
211
- # chunk = text[start:end].strip()
212
-
213
- # if chunk and len(chunk) > 50: # Only add substantial chunks
214
- # chunks.append(chunk)
215
-
216
- # # Move start position with overlap
217
- # start = max(start + 1, end - overlap)
218
-
219
- # # Safety check to prevent infinite loops
220
- # if start >= len(text):
221
- # break
222
-
223
- # return chunks
224
-
225
- # def get_collection_count(self):
226
- # """
227
- # Get the number of documents in the collection.
228
- # """
229
- # try:
230
- # return self.collection.count()
231
- # except Exception as e:
232
- # print(f"Error getting collection count: {e}")
233
- # return 0
234
  import chromadb
235
  import PyPDF2
236
  import time
 
 
 
 
 
 
 
1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import chromadb
3
  import PyPDF2
4
  import time