Spaces:

BluescarfAI
/

CV-Info-Agent

Sleeping

App Files Community

dure-waseem committed on Jul 18, 2025

Commit

b1c00a1

1 Parent(s): 1ad4324

initial code

Browse files

Files changed (3) hide show

app.py +38 -462
chromadb_query.py +0 -118
chromadb_upload.py +0 -232

app.py CHANGED Viewed

@@ -1,396 +1,3 @@
-# import gradio as gr
-# import os
-# import tempfile
-# import shutil
-# from chromadb_query import ChromaCollection
-# from chromadb_upload import ChromaUploader
-# # Global variables to store instances
-# chroma_collection = None
-# chroma_uploader = None
-# current_api_key = None
-# def initialize_chroma_components(api_key):
-#     """Initialize ChromaDB components with the provided API key"""
-#     global chroma_collection, chroma_uploader, current_api_key
-#     if not api_key:
-#         return "❌ Please provide an OpenAI API key"
-#     try:
-#         # Set the API key in environment
-#         os.environ["OPENAI_API_KEY"] = api_key
-#         current_api_key = api_key
-#         # Initialize components
-#         db_path = "./db"
-#         os.makedirs(db_path, exist_ok=True)
-#         collection_name = "my_collection"
-#         chroma_collection = ChromaCollection(collection_name, db_path, api_key)
-#         chroma_uploader = ChromaUploader(collection_name, db_path, api_key)
-#         return "✅ ChromaDB components initialized successfully!"
-#     except Exception as e:
-#         return f"❌ Error initializing components: {str(e)}"
-# def query_documents(api_key, query, n_results):
-#     """Query the document collection"""
-#     global chroma_collection
-#     if not api_key:
-#         return "❌ Please provide an OpenAI API key"
-#     if not query.strip():
-#         return "❌ Please enter a query"
-#     # Validate API key format
-#     if not api_key.startswith("sk-") or len(api_key) < 20:
-#         return "❌ Invalid OpenAI API key format. It should start with 'sk-' and be longer than 20 characters."
-#     # Initialize or check if we need to reinitialize
-#     if chroma_collection is None or current_api_key != api_key:
-#         init_msg = initialize_chroma_components(api_key)
-#         if "Error" in init_msg:
-#             return init_msg
-#     try:
-#         # Query the collection
-#         results = chroma_collection.query_collection([query], n_results=n_results)
-#         if not results['documents'][0]:
-#             return """❌ No documents found in the collection.
-# 📚 **Next steps:**
-# 1. Go to the "📄 Upload Documents" tab
-# 2. Upload some PDF files first
-# 3. Come back and ask your question"""
-#         # Generate answer
-#         answer = chroma_collection.generate_answer(query, results)
-#         # Check if answer indicates an error
-#         if answer.startswith("Error generating answer"):
-#             return f"""❌ Error generating answer: {answer}
-# 🔍 **Troubleshooting:**
-# - Check your internet connection
-# - Verify your OpenAI API key has credits
-# - Try a simpler question
-# - Wait a moment and try again"""
-#         # Count documents for context
-#         try:
-#             doc_count = chroma_collection.get_collection_count()
-#             context_info = f"\n\n---\n*Answer based on {len(results['documents'][0])} relevant chunks from {doc_count} total documents*"
-#         except:
-#             context_info = f"\n\n---\n*Answer based on {len(results['documents'][0])} relevant document chunks*"
-#         return f"🤖 **Answer:**\n\n{answer}{context_info}"
-#     except Exception as e:
-#         error_msg = str(e).lower()
-#         if "connection" in error_msg or "timeout" in error_msg:
-#             return f"""❌ Connection error: {str(e)}
-# 🔍 **Troubleshooting:**
-# - Check your internet connection
-# - Verify OpenAI API is accessible
-# - Try again in a few moments"""
-#         elif "api" in error_msg and "key" in error_msg:
-#             return f"""❌ API key error: {str(e)}
-# 🔑 **Please check:**
-# - Your API key is correct
-# - Your OpenAI account has sufficient credits
-# - The API key has the necessary permissions"""
-#         else:
-#             return f"❌ Error querying documents: {str(e)}"
-# def upload_pdf(api_key, pdf_file):
-#     """Upload and process PDF file"""
-#     global chroma_uploader
-#     if not api_key:
-#         return "❌ Please provide an OpenAI API key"
-#     if pdf_file is None:
-#         return "❌ Please upload a PDF file"
-#     # Validate API key format
-#     if not api_key.startswith("sk-") or len(api_key) < 20:
-#         return "❌ Invalid OpenAI API key format. It should start with 'sk-' and be longer than 20 characters."
-#     # Initialize or check if we need to reinitialize
-#     if chroma_uploader is None or current_api_key != api_key:
-#         init_msg = initialize_chroma_components(api_key)
-#         if "Error" in init_msg:
-#             return init_msg
-#     try:
-#         # Read the PDF file
-#         with open(pdf_file.name, 'rb') as file:
-#             pdf_bytes = file.read()
-#         # Extract text from PDF
-#         pdf_text, pdf_lines = chroma_uploader.extract_text_from_pdf_bytes(pdf_bytes)
-#         if not pdf_text or not pdf_lines:
-#             return "❌ Could not extract text from the PDF file. Make sure it's a text-based PDF (not scanned images)."
-#         # Add documents to ChromaDB with better feedback
-#         print(f"Processing {len(pdf_lines)} document chunks...")
-#         success = chroma_uploader.add_documents(pdf_lines)
-#         if success:
-#             # Get updated count
-#             try:
-#                 count = chroma_uploader.get_collection_count()
-#                 return f"✅ Successfully processed PDF!\n\n📊 Added document chunks from '{os.path.basename(pdf_file.name)}'\n🗃️ Total documents in collection: {count}"
-#             except:
-#                 return f"✅ Successfully processed and added document chunks from '{os.path.basename(pdf_file.name)}'!"
-#         else:
-#             return """❌ Failed to add documents to ChromaDB.
-# 🔍 **Troubleshooting tips:**
-# - Check your internet connection
-# - Verify your OpenAI API key has credits
-# - Try uploading a smaller PDF file
-# - Wait a moment and try again (rate limits)"""
-#     except Exception as e:
-#         error_msg = str(e).lower()
-#         if "connection" in error_msg or "timeout" in error_msg:
-#             return f"""❌ Connection error occurred: {str(e)}
-# 🔍 **Troubleshooting:**
-# - Check your internet connection
-# - Verify OpenAI API is accessible
-# - Try again in a few moments
-# - If on Hugging Face, the service might be temporarily overloaded"""
-#         elif "api" in error_msg and "key" in error_msg:
-#             return f"""❌ API key error: {str(e)}
-# 🔑 **Please check:**
-# - Your API key is correct and starts with 'sk-'
-# - Your OpenAI account has sufficient credits
-# - The API key has the necessary permissions"""
-#         else:
-#             return f"❌ Error processing PDF: {str(e)}"
-# def test_api_key(api_key):
-#     """Test if the API key is working"""
-#     if not api_key:
-#         return "❌ Please provide an OpenAI API key"
-#     if not api_key.startswith("sk-") or len(api_key) < 20:
-#         return "❌ Invalid API key format. OpenAI keys should start with 'sk-' and be longer than 20 characters."
-#     try:
-#         from openai import OpenAI
-#         client = OpenAI(api_key=api_key)
-#         # Test with a simple API call
-#         response = client.chat.completions.create(
-#             model="gpt-4o-mini",
-#             messages=[{"role": "user", "content": "Hello"}],
-#             max_tokens=5
-#         )
-#         return "✅ API key is working! You can now upload documents and ask questions."
-#     except Exception as e:
-#         error_msg = str(e).lower()
-#         if "api" in error_msg and "key" in error_msg:
-#             return f"❌ API key error: Invalid or expired API key. Please check your key and account credits."
-#         elif "quota" in error_msg or "limit" in error_msg:
-#             return f"❌ Quota/rate limit error: Your API key has reached its limit or you're out of credits."
-#         elif "connection" in error_msg or "timeout" in error_msg:
-#             return f"❌ Connection error: Unable to reach OpenAI API. Check your internet connection."
-#         else:
-#             return f"❌ Error testing API key: {str(e)}"
-# def get_collection_info(api_key):
-#     """Get information about the current collection"""
-#     global chroma_uploader
-#     if not api_key:
-#         return "❌ Please provide an OpenAI API key"
-#     if chroma_uploader is None or current_api_key != api_key:
-#         init_msg = initialize_chroma_components(api_key)
-#         if "Error" in init_msg:
-#             return init_msg
-#     try:
-#         count = chroma_uploader.get_collection_count()
-#         if count == 0:
-#             return """📊 Collection is empty
-# 🚀 **Get started:**
-# 1. Upload PDF files using the upload section above
-# 2. Documents will be processed and stored automatically
-# 3. Then you can ask questions about your documents"""
-#         else:
-#             return f"""📊 Collection Status:
-# 🗃️ **Total documents:** {count} chunks
-# ✅ **Status:** Ready for questions
-# 🔍 **You can now:** Ask questions about your uploaded documents"""
-#     except Exception as e:
-#         return f"❌ Error getting collection info: {str(e)}"
-# # Create Gradio interface
-# def create_interface():
-#     with gr.Blocks(title="CV-Info-Agent", theme=gr.themes.Soft()) as demo:
-#         gr.Markdown(
-#             """
-#             # 📚 ChromaDB Q&A System
-#             Upload PDF documents and ask questions about their content using AI-powered search and retrieval.
-#             **⚠️ Important:** You need to provide your own OpenAI API key to use this application.
-#             """
-#         )
-#         # API Key input (will be hidden)
-#         with gr.Row():
-#             with gr.Column(scale=4):
-#                 api_key_input = gr.Textbox(
-#                     label="🔑 OpenAI API Key",
-#                     placeholder="Enter your OpenAI API key (sk-...)",
-#                     type="password",
-#                     info="Your API key is not stored and is only used for this session"
-#                 )
-#             with gr.Column(scale=1):
-#                 test_key_button = gr.Button("🧪 Test API Key", variant="secondary")
-#         api_test_output = gr.Markdown(label="API Key Status")
-#         test_key_button.click(
-#             test_api_key,
-#             inputs=[api_key_input],
-#             outputs=api_test_output
-#         )
-#         with gr.Tabs():
-#             # Q&A Tab
-#             with gr.Tab("🤖 Ask Questions"):
-#                 gr.Markdown("### Ask questions about your uploaded documents")
-#                 with gr.Row():
-#                     with gr.Column(scale=3):
-#                         query_input = gr.Textbox(
-#                             label="Your Question",
-#                             placeholder="Ask me anything about your documents...",
-#                             lines=3
-#                         )
-#                     with gr.Column(scale=1):
-#                         n_results_slider = gr.Slider(
-#                             minimum=1,
-#                             maximum=20,
-#                             value=10,
-#                             step=1,
-#                             label="Max Results"
-#                         )
-#                 query_button = gr.Button("🔍 Get Answer", variant="primary")
-#                 query_output = gr.Markdown(label="Answer")
-#                 query_button.click(
-#                     query_documents,
-#                     inputs=[api_key_input, query_input, n_results_slider],
-#                     outputs=query_output
-#                 )
-#             # Upload Tab
-#             with gr.Tab("📄 Upload Documents"):
-#                 gr.Markdown("### Upload PDF documents to your knowledge base")
-#                 pdf_upload = gr.File(
-#                     label="Upload PDF File",
-#                     file_types=[".pdf"],
-#                     type="filepath"
-#                 )
-#                 upload_button = gr.Button("📁 Process PDF", variant="primary")
-#                 upload_output = gr.Markdown(label="Upload Status")
-#                 upload_button.click(
-#                     upload_pdf,
-#                     inputs=[api_key_input, pdf_upload],
-#                     outputs=upload_output
-#                 )
-#                 # Collection info
-#                 info_button = gr.Button("📊 Check Collection Status")
-#                 info_output = gr.Markdown(label="Collection Information")
-#                 info_button.click(
-#                     get_collection_info,
-#                     inputs=[api_key_input],
-#                     outputs=info_output
-#                 )
-#         # Instructions
-#         with gr.Accordion("📖 How to Use & Troubleshooting", open=False):
-#             gr.Markdown(
-#                 """
-#                 ### Instructions:
-#                 1. **Enter your OpenAI API Key** - Get one from [OpenAI's website](https://platform.openai.com/api-keys)
-#                 2. **Test your API Key** - Click "🧪 Test API Key" to verify it's working
-#                 3. **Upload PDF Documents** - Go to the "Upload Documents" tab and upload your PDF files
-#                 4. **Ask Questions** - Switch to the "Ask Questions" tab and query your documents
-#                 ### 🚨 Troubleshooting Connection Errors:
-#                 **"Connection error" when uploading documents:**
-#                 - ✅ Check your internet connection
-#                 - ✅ Verify your OpenAI API key has sufficient credits
-#                 - ✅ Wait 30 seconds and try again (rate limits)
-#                 - ✅ Try uploading smaller PDF files
-#                 - ✅ If on Hugging Face Spaces, the service might be temporarily overloaded
-#                 **API Key Issues:**
-#                 - ✅ Make sure your key starts with `sk-`
-#                 - ✅ Check your OpenAI account has credits
-#                 - ✅ Verify the key has proper permissions
-#                 - ✅ Test your key using the "🧪 Test API Key" button
-#                 **PDF Upload Issues:**
-#                 - ✅ Ensure PDF contains text (not just images)
-#                 - ✅ Try smaller PDF files (under 10MB)
-#                 - ✅ Check PDF isn't password protected
-#                 ### Features:
-#                 - 🔒 **Secure**: Your API key is not stored permanently
-#                 - 📚 **Multiple Documents**: Upload multiple PDFs to build your knowledge base
-#                 - 🎯 **Accurate Answers**: Get AI-powered answers based on your document content
-#                 - ⚡ **Fast Search**: Vector-based similarity search for relevant content
-#                 - 🔄 **Retry Logic**: Automatic retry for connection issues
-#                 ### Notes:
-#                 - PDF text extraction works with most standard PDF formats
-#                 - Documents are stored locally during your session
-#                 - Each document is chunked for better search performance
-#                 - The system uses OpenAI's text-embedding-ada-002 for embeddings
-#                 - Answers are generated using GPT-4o-mini model
-#                 """
-#             )
-#     return demo
-# # Launch the application
-# if __name__ == "__main__":
-#     demo = create_interface()
-#     demo.launch(
-#         server_name="0.0.0.0",
-#         server_port=7860,
-#         share=True  # Set to True to create a public link
-#     )
 import gradio as gr
 import os
 import tempfile
@@ -604,45 +211,45 @@ def test_api_key(api_key):
         else:
             return f"❌ Error testing API key: {str(e)}"
-def get_collection_info(api_key):
-    """Get information about the current collection"""
-    global chroma_uploader
-    if not api_key:
-        return "❌ Please provide an OpenAI API key"
-    if chroma_uploader is None or current_api_key != api_key:
-        init_msg = initialize_chroma_components(api_key)
-        if "Error" in init_msg:
-            return init_msg
-    try:
-        count = chroma_uploader.get_collection_count()
-        if count == 0:
-            return """📊 Collection is empty
-🚀 **Get started:**
-1. Upload PDF files using the upload section above
-2. Documents will be processed and stored automatically
-3. Then you can ask questions about your documents"""
-        else:
-            return f"""📊 Collection Status:
-🗃️ **Total documents:** {count} chunks
-✅ **Status:** Ready for questions
-🔍 **You can now:** Ask questions about your uploaded documents"""
-    except Exception as e:
-        return f"❌ Error getting collection info: {str(e)}"
 # Create Gradio interface
 def create_interface():
-    with gr.Blocks(title="ChromaDB Q&A System", theme=gr.themes.Soft()) as demo:
         gr.Markdown(
             """
-            # 📚 ChromaDB Q&A System
-            Upload PDF documents and ask questions about their content using AI-powered search and retrieval.
             **⚠️ Important:** You need to provide your own OpenAI API key to use this application.
             """
         )
@@ -688,14 +295,14 @@ def create_interface():
                 )
                 # Collection info
-                info_button = gr.Button("📊 Check Collection Status")
-                info_output = gr.Markdown(label="Collection Information")
-                info_button.click(
-                    get_collection_info,
-                    inputs=[api_key_input],
-                    outputs=info_output
-                )
             # Q&A Tab (now second)
             with gr.Tab("🤖 Ask Questions"):
@@ -724,43 +331,12 @@ def create_interface():
                 1. **Enter your OpenAI API Key** - Get one from [OpenAI's website](https://platform.openai.com/api-keys)
                 2. **Test your API Key** - Click "🧪 Test API Key" to verify it's working
-                3. **Upload PDF Documents** - Go to the "Upload Documents" tab and upload your PDF files
                 4. **Ask Questions** - Switch to the "Ask Questions" tab and query your documents
-                ### 🚨 Troubleshooting Connection Errors:
-                **"Connection error" when uploading documents:**
-                - ✅ Check your internet connection
-                - ✅ Verify your OpenAI API key has sufficient credits
-                - ✅ Wait 30 seconds and try again (rate limits)
-                - ✅ Try uploading smaller PDF files
-                - ✅ If on Hugging Face Spaces, the service might be temporarily overloaded
-                **API Key Issues:**
-                - ✅ Make sure your key starts with `sk-`
-                - ✅ Check your OpenAI account has credits
-                - ✅ Verify the key has proper permissions
-                - ✅ Test your key using the "🧪 Test API Key" button
-                **PDF Upload Issues:**
-                - ✅ Ensure PDF contains text (not just images)
-                - ✅ Try smaller PDF files (under 10MB)
-                - ✅ Check PDF isn't password protected
-                ### Features:
-                - 🔒 **Secure**: Your API key is not stored permanently
-                - 📚 **Multiple Documents**: Upload multiple PDFs to build your knowledge base
-                - 🎯 **Accurate Answers**: Get AI-powered answers based on your document content
-                - ⚡ **Fast Search**: Vector-based similarity search for relevant content
-                - 🔄 **Retry Logic**: Automatic retry for connection issues
-                ### Notes:
-                - PDF text extraction works with most standard PDF formats
-                - Documents are stored locally during your session
-                - Each document is chunked for better search performance
-                - The system uses OpenAI's text-embedding-ada-002 for embeddings
-                - Answers are generated using GPT-4o-mini model
-                """
             )
     return demo

 import gradio as gr
 import os
 import tempfile
         else:
             return f"❌ Error testing API key: {str(e)}"
+# def get_collection_info(api_key):
+#     """Get information about the current collection"""
+#     global chroma_uploader
+#     if not api_key:
+#         return "❌ Please provide an OpenAI API key"
+#     if chroma_uploader is None or current_api_key != api_key:
+#         init_msg = initialize_chroma_components(api_key)
+#         if "Error" in init_msg:
+#             return init_msg
+#     try:
+#         count = chroma_uploader.get_collection_count()
+#         if count == 0:
+#             return """📊 Collection is empty
+# 🚀 **Get started:**
+# 1. Upload PDF files using the upload section above
+# 2. Documents will be processed and stored automatically
+# 3. Then you can ask questions about your documents"""
+#         else:
+#             return f"""📊 Collection Status:
+# 🗃️ **Total documents:** {count} chunks
+# ✅ **Status:** Ready for questions
+# 🔍 **You can now:** Ask questions about your uploaded documents"""
+#     except Exception as e:
+#         return f"❌ Error getting collection info: {str(e)}"
 # Create Gradio interface
 def create_interface():
+    with gr.Blocks(title="CV Document Q&A System", theme=gr.themes.Soft()) as demo:
         gr.Markdown(
             """
+            # 📚 CV Document Q&A System
+            Upload the CV and ask questions about its content using AI-powered search and retrieval.
             **⚠️ Important:** You need to provide your own OpenAI API key to use this application.
             """
         )
                 )
                 # Collection info
+                # info_button = gr.Button("📊 Check Collection Status")
+                # info_output = gr.Markdown(label="Collection Information")
+                # info_button.click(
+                #     get_collection_info,
+                #     inputs=[api_key_input],
+                #     outputs=info_output
+                # )
             # Q&A Tab (now second)
             with gr.Tab("🤖 Ask Questions"):
                 1. **Enter your OpenAI API Key** - Get one from [OpenAI's website](https://platform.openai.com/api-keys)
                 2. **Test your API Key** - Click "🧪 Test API Key" to verify it's working
+                3. **Upload PDF Documents** - Go to the "Upload Documents" tab and upload your PDF files.
                 4. **Ask Questions** - Switch to the "Ask Questions" tab and query your documents
+               """
             )
     return demo

chromadb_query.py CHANGED Viewed

@@ -1,122 +1,4 @@
-# import chromadb
-# import time
-# import chromadb.utils.embedding_functions as embedding_functions
-# import os
-# from openai import OpenAI
-# class ChromaCollection:
-#     def __init__(self, collection_name, db_path, api_key=None):
-#         # Initialize Chroma persistent client and collection name
-#         self.chroma_client = chromadb.PersistentClient(path=db_path)
-#         self.collection_name = collection_name
-#         self.collection = None
-#         # Use provided API key or fall back to environment variable
-#         self.openai_key = api_key or os.getenv("OPENAI_API_KEY")
-#         if not self.openai_key:
-#             raise ValueError("OpenAI API key is required")
-#         self.openai_ef = embedding_functions.OpenAIEmbeddingFunction(
-#             api_key=self.openai_key,
-#             model_name="text-embedding-ada-002"
-#         )
-#         # Initialize OpenAI client
-#         self.openai_client = OpenAI(api_key=self.openai_key)
-#         self._initialize_collection()
-#     def _initialize_collection(self):
-#         """
-#         Initializes the collection if it doesn't exist.
-#         """
-#         try:
-#             self.collection = self.chroma_client.get_collection(
-#                 name=self.collection_name,
-#                 embedding_function=self.openai_ef
-#             )
-#             print(f"Collection '{self.collection_name}' already exists.")
-#         except Exception as e:
-#             # If collection doesn't exist, create a new one
-#             self.collection = self.chroma_client.create_collection(
-#                 name=self.collection_name,
-#                 embedding_function=self.openai_ef
-#             )
-#             print(f"Created new collection '{self.collection_name}'.")
-#     def query_collection(self, query_texts, n_results=1):
-#         """
-#         Queries the collection with the given text and returns the results.
-#         :param query_texts: List of query strings
-#         :param n_results: Number of results to return
-#         :return: Query results
-#         """
-#         try:
-#             results = self.collection.query(
-#                 query_texts=query_texts,  # Chroma will embed this for you
-#                 n_results=n_results  # How many results to return
-#             )
-#             return results
-#         except Exception as e:
-#             print(f"Error querying collection: {e}")
-#             return {"documents": [[]], "metadatas": [[]], "distances": [[]]}
-#     def generate_answer(self, query, results):
-#         """
-#         Takes the query and ChromaDB results and generates an accurate answer using the LLM.
-#         :param query: User's query
-#         :param results: ChromaDB results
-#         :return: Generated answer from LLM
-#         """
-#         # Check if we have any results
-#         if not results['documents'][0]:
-#             return "No relevant documents found to answer your question."
-#         # Prepare the context for LLM by appending the query and results
-#         documents_text = "\n".join(results['documents'][0][:5])  # Use top 5 results
-#         context = f"""Based on the following context from the documents, please answer the user's question accurately and concisely.
-# Context from documents:
-# {documents_text}
-# User's question: {query}
-# Please provide a clear and accurate answer based only on the information provided in the context above."""
-#         try:
-#             # Use the new OpenAI API format
-#             response = self.openai_client.chat.completions.create(
-#                 model="gpt-4o-mini",
-#                 messages=[
-#                     {
-#                         "role": "system",
-#                         "content": "You are a helpful assistant that answers questions based on provided document context. Only use information from the provided context to answer questions."
-#                     },
-#                     {
-#                         "role": "user",
-#                         "content": context
-#                     }
-#                 ],
-#                 max_tokens=500,
-#                 temperature=0.1
-#             )
-#             # Extract and return the answer from the response
-#             return response.choices[0].message.content.strip()
-#         except Exception as e:
-#             return f"Error generating answer: {str(e)}"
-#     def get_collection_count(self):
-#         """
-#         Get the number of documents in the collection.
-#         """
-#         try:
-#             return self.collection.count()
-#         except Exception as e:
-#             print(f"Error getting collection count: {e}")
-#             return 0
 import chromadb
 import time
 import chromadb.utils.embedding_functions as embedding_functions

 import chromadb
 import time
 import chromadb.utils.embedding_functions as embedding_functions

chromadb_upload.py CHANGED Viewed

@@ -1,236 +1,4 @@
-# import chromadb
-# import PyPDF2
-# import time
-# import chromadb.utils.embedding_functions as embedding_functions
-# import os
-# import io
-# class ChromaUploader:
-#     def __init__(self, collection_name, db_path, api_key=None):
-#         # Initialize Chroma persistent client and collection name
-#         self.chroma_client = chromadb.PersistentClient(path=db_path)
-#         self.collection_name = collection_name
-#         self.collection = None
-#         # Use provided API key or fall back to environment variable
-#         self.openai_key = api_key or os.getenv("OPENAI_API_KEY")
-#         if not self.openai_key:
-#             raise ValueError("OpenAI API key is required")
-#         self.openai_ef = embedding_functions.OpenAIEmbeddingFunction(
-#             api_key=self.openai_key,
-#             model_name="text-embedding-ada-002"
-#         )
-#         self._initialize_collection()
-#     def _initialize_collection(self):
-#         """
-#         Initializes the collection if it doesn't exist.
-#         """
-#         try:
-#             self.collection = self.chroma_client.get_collection(
-#                 name=self.collection_name,
-#                 embedding_function=self.openai_ef
-#             )
-#             print(f"Collection '{self.collection_name}' already exists.")
-#         except Exception as e:
-#             # If collection doesn't exist, create a new one
-#             self.collection = self.chroma_client.create_collection(
-#                 name=self.collection_name,
-#                 embedding_function=self.openai_ef
-#             )
-#             print(f"Created new collection '{self.collection_name}'.")
-#     def add_documents(self, documents):
-#         """
-#         Adds documents to the collection, ensuring no duplicate IDs.
-#         :param documents: List of document strings to be added
-#         """
-#         if documents is None or len(documents) == 0:
-#             print("No data collected from the document to add.")
-#             return False
-#         try:
-#             # Create unique IDs for each document chunk
-#             timestamp = int(time.time() * 1000000)  # microseconds for uniqueness
-#             ids = [f"doc_{timestamp}_{i}" for i in range(len(documents))]
-#             # Filter out empty documents
-#             valid_documents = []
-#             valid_ids = []
-#             for i, doc in enumerate(documents):
-#                 if doc and doc.strip() and len(doc.strip()) > 10:  # Only add non-empty docs with some content
-#                     valid_documents.append(doc.strip())
-#                     valid_ids.append(ids[i])
-#             if not valid_documents:
-#                 print("No valid documents to add after filtering.")
-#                 return False
-#             # Add documents to collection in batches to avoid memory issues
-#             batch_size = 100
-#             for i in range(0, len(valid_documents), batch_size):
-#                 batch_docs = valid_documents[i:i + batch_size]
-#                 batch_ids = valid_ids[i:i + batch_size]
-#                 self.collection.add(
-#                     documents=batch_docs,
-#                     ids=batch_ids
-#                 )
-#             print(f"Added {len(valid_documents)} documents to collection '{self.collection_name}'.")
-#             return True
-#         except Exception as e:
-#             print(f"Error adding documents to collection: {e}")
-#             return False
-#     def extract_text_from_pdf_bytes(self, pdf_bytes):
-#         """
-#         Extracts text from a PDF file from bytes (for Gradio uploaded files).
-#         :param pdf_bytes: PDF file as bytes
-#         :return: Extracted text from the PDF and the lines as a list
-#         """
-#         try:
-#             # Create a file-like object from bytes
-#             pdf_file = io.BytesIO(pdf_bytes)
-#             # Create a PDF reader object
-#             pdf_reader = PyPDF2.PdfReader(pdf_file)
-#             # Initialize an empty string to store extracted text
-#             text = ""
-#             # Extract text from each page
-#             for page_num, page in enumerate(pdf_reader.pages):
-#                 try:
-#                     # Extract text from the page
-#                     page_text = page.extract_text()
-#                     # Clean up the extracted text
-#                     cleaned_text = self._clean_extracted_text(page_text)
-#                     if cleaned_text.strip():  # Only add non-empty pages
-#                         # Append to the total text with page marker
-#                         text += f"\n--- Page {page_num + 1} ---\n{cleaned_text}\n"
-#                 except Exception as e:
-#                     print(f"Error extracting text from page {page_num + 1}: {e}")
-#                     continue
-#             if not text.strip():
-#                 return "", []
-#             # Split text into meaningful chunks
-#             chunks = self._split_text_into_chunks(text, max_chunk_size=1000, overlap=100)
-#             return text.strip(), chunks
-#         except Exception as e:
-#             print(f"Error extracting text from PDF: {e}")
-#             return "", []
-#     def extract_text_from_pdf(self, pdf_path):
-#         """
-#         Extracts text from a PDF file using PyPDF2 with improved text extraction.
-#         :param pdf_path: Path to the PDF file
-#         :return: Extracted text from the PDF and the lines as a list
-#         """
-#         try:
-#             # Open the PDF file
-#             with open(pdf_path, 'rb') as file:
-#                 pdf_bytes = file.read()
-#                 return self.extract_text_from_pdf_bytes(pdf_bytes)
-#         except Exception as e:
-#             print(f"Error extracting text from PDF: {e}")
-#             return "", []
-#     def _clean_extracted_text(self, text):
-#         """
-#         Clean up extracted text to improve readability and remove unnecessary whitespace.
-#         :param text: Raw extracted text
-#         :return: Cleaned text
-#         """
-#         if not text:
-#             return ""
-#         # Remove excessive whitespace and clean up
-#         lines = []
-#         for line in text.split('\n'):
-#             cleaned_line = line.strip()
-#             if cleaned_line and len(cleaned_line) > 2:  # Filter out very short lines
-#                 lines.append(cleaned_line)
-#         # Join lines with proper spacing
-#         cleaned_text = ' '.join(lines)
-#         # Remove multiple spaces
-#         while '  ' in cleaned_text:
-#             cleaned_text = cleaned_text.replace('  ', ' ')
-#         return cleaned_text
-#     def _split_text_into_chunks(self, text, max_chunk_size=1000, overlap=100):
-#         """
-#         Split text into overlapping chunks for better context preservation.
-#         :param text: Text to split
-#         :param max_chunk_size: Maximum size of each chunk
-#         :param overlap: Number of characters to overlap between chunks
-#         :return: List of text chunks
-#         """
-#         if not text:
-#             return []
-#         chunks = []
-#         start = 0
-#         while start < len(text):
-#             # Calculate end position
-#             end = start + max_chunk_size
-#             # If we're not at the end of the text, try to end at a sentence boundary
-#             if end < len(text):
-#                 # Look for sentence endings within the last 200 characters
-#                 search_start = max(end - 200, start)
-#                 sentence_endings = ['. ', '! ', '? ', '\n\n']
-#                 best_end = end
-#                 for ending in sentence_endings:
-#                     pos = text.rfind(ending, search_start, end)
-#                     if pos > start:
-#                         best_end = pos + len(ending)
-#                         break
-#                 end = best_end
-#             # Extract chunk
-#             chunk = text[start:end].strip()
-#             if chunk and len(chunk) > 50:  # Only add substantial chunks
-#                 chunks.append(chunk)
-#             # Move start position with overlap
-#             start = max(start + 1, end - overlap)
-#             # Safety check to prevent infinite loops
-#             if start >= len(text):
-#                 break
-#         return chunks
-#     def get_collection_count(self):
-#         """
-#         Get the number of documents in the collection.
-#         """
-#         try:
-#             return self.collection.count()
-#         except Exception as e:
-#             print(f"Error getting collection count: {e}")
-#             return 0
 import chromadb
 import PyPDF2
 import time

 import chromadb
 import PyPDF2
 import time