Spaces:

BluescarfAI
/

CV-Info-Agent

Sleeping

App Files Files Community

dure-waseem committed on Jul 18, 2025

Commit

bf08844

1 Parent(s): a7017a6

initial code

Browse files

Files changed (2) hide show

app.py +0 -353
chromadb_upload.py +0 -283

app.py CHANGED Viewed

@@ -1,356 +1,3 @@
-# import gradio as gr
-# import os
-# import tempfile
-# import shutil
-# from chromadb_query import ChromaCollection
-# from chromadb_upload import ChromaUploader
-# # Global variables to store instances
-# chroma_collection = None
-# chroma_uploader = None
-# current_api_key = None
-# def initialize_chroma_components(api_key):
-#     """Initialize ChromaDB components with the provided API key"""
-#     global chroma_collection, chroma_uploader, current_api_key
-#     if not api_key:
-#         return "❌ Please provide an OpenAI API key"
-#     try:
-#         # Set the API key in environment
-#         os.environ["OPENAI_API_KEY"] = api_key
-#         current_api_key = api_key
-#         # Initialize components
-#         db_path = "./db"
-#         os.makedirs(db_path, exist_ok=True)
-#         collection_name = "my_collection"
-#         chroma_collection = ChromaCollection(collection_name, db_path, api_key)
-#         chroma_uploader = ChromaUploader(collection_name, db_path, api_key)
-#         return "✅ ChromaDB components initialized successfully!"
-#     except Exception as e:
-#         return f"❌ Error initializing components: {str(e)}"
-# def query_documents(api_key, query):
-#     """Query the document collection"""
-#     global chroma_collection
-#     if not api_key:
-#         return "❌ Please provide an OpenAI API key"
-#     if not query.strip():
-#         return "❌ Please enter a query"
-#     # Validate API key format
-#     if not api_key.startswith("sk-") or len(api_key) < 20:
-#         return "❌ Invalid OpenAI API key format. It should start with 'sk-' and be longer than 20 characters."
-#     # Initialize or check if we need to reinitialize
-#     if chroma_collection is None or current_api_key != api_key:
-#         init_msg = initialize_chroma_components(api_key)
-#         if "Error" in init_msg:
-#             return init_msg
-#     try:
-#         # Query the collection with fixed n_results=5
-#         results = chroma_collection.query_collection([query], n_results=5)
-#         if not results['documents'][0]:
-#             return """❌ No documents found in the collection.
-# 📚 **Next steps:**
-# 1. Go to the "📄 Upload Documents" tab
-# 2. Upload some PDF files first
-# 3. Come back and ask your question"""
-#         # Generate answer
-#         answer = chroma_collection.generate_answer(query, results)
-#         # Check if answer indicates an error
-#         if answer.startswith("Error generating answer"):
-#             return f"""❌ Error generating answer: {answer}
-# 🔍 **Troubleshooting:**
-# - Check your internet connection
-# - Verify your OpenAI API key has credits
-# - Try a simpler question
-# - Wait a moment and try again"""
-#         # Count documents for context
-#         try:
-#             doc_count = chroma_collection.get_collection_count()
-#             context_info = f"\n\n---\n*Answer based on {len(results['documents'][0])} relevant chunks from {doc_count} total documents*"
-#         except:
-#             context_info = f"\n\n---\n*Answer based on {len(results['documents'][0])} relevant document chunks*"
-#         return f"🤖 **Answer:**\n\n{answer}{context_info}"
-#     except Exception as e:
-#         error_msg = str(e).lower()
-#         if "connection" in error_msg or "timeout" in error_msg:
-#             return f"""❌ Connection error: {str(e)}
-# 🔍 **Troubleshooting:**
-# - Check your internet connection
-# - Verify OpenAI API is accessible
-# - Try again in a few moments"""
-#         elif "api" in error_msg and "key" in error_msg:
-#             return f"""❌ API key error: {str(e)}
-# 🔑 **Please check:**
-# - Your API key is correct
-# - Your OpenAI account has sufficient credits
-# - The API key has the necessary permissions"""
-#         else:
-#             return f"❌ Error querying documents: {str(e)}"
-# def upload_pdf(api_key, pdf_file):
-#     """Upload and process PDF file"""
-#     global chroma_uploader
-#     if not api_key:
-#         return "❌ Please provide an OpenAI API key"
-#     if pdf_file is None:
-#         return "❌ Please upload a PDF file"
-#     # Validate API key format
-#     if not api_key.startswith("sk-") or len(api_key) < 20:
-#         return "❌ Invalid OpenAI API key format. It should start with 'sk-' and be longer than 20 characters."
-#     # Initialize or check if we need to reinitialize
-#     if chroma_uploader is None or current_api_key != api_key:
-#         init_msg = initialize_chroma_components(api_key)
-#         if "Error" in init_msg:
-#             return init_msg
-#     try:
-#         # Read the PDF file
-#         with open(pdf_file.name, 'rb') as file:
-#             pdf_bytes = file.read()
-#         # Extract text from PDF
-#         pdf_text, pdf_lines = chroma_uploader.extract_text_from_pdf_bytes(pdf_bytes)
-#         if not pdf_text or not pdf_lines:
-#             return "❌ Could not extract text from the PDF file. Make sure it's a text-based PDF (not scanned images)."
-#         # Add documents to ChromaDB with better feedback
-#         print(f"Processing {len(pdf_lines)} document chunks...")
-#         success = chroma_uploader.add_documents(pdf_lines)
-#         if success:
-#             # Get updated count
-#             try:
-#                 count = chroma_uploader.get_collection_count()
-#                 return f"✅ Successfully processed PDF!\n\n📊 Added document chunks from '{os.path.basename(pdf_file.name)}'\n🗃️ Total documents in collection: {count}"
-#             except:
-#                 return f"✅ Successfully processed and added document chunks from '{os.path.basename(pdf_file.name)}'!"
-#         else:
-#             return """❌ Failed to add documents to ChromaDB.
-# 🔍 **Troubleshooting tips:**
-# - Check your internet connection
-# - Verify your OpenAI API key has credits
-# - Try uploading a smaller PDF file
-# - Wait a moment and try again (rate limits)"""
-#     except Exception as e:
-#         error_msg = str(e).lower()
-#         if "connection" in error_msg or "timeout" in error_msg:
-#             return f"""❌ Connection error occurred: {str(e)}
-# 🔍 **Troubleshooting:**
-# - Check your internet connection
-# - Verify OpenAI API is accessible
-# - Try again in a few moments
-# - If on Hugging Face, the service might be temporarily overloaded"""
-#         elif "api" in error_msg and "key" in error_msg:
-#             return f"""❌ API key error: {str(e)}
-# 🔑 **Please check:**
-# - Your API key is correct and starts with 'sk-'
-# - Your OpenAI account has sufficient credits
-# - The API key has the necessary permissions"""
-#         else:
-#             return f"❌ Error processing PDF: {str(e)}"
-# def test_api_key(api_key):
-#     """Test if the API key is working"""
-#     if not api_key:
-#         return "❌ Please provide an OpenAI API key"
-#     if not api_key.startswith("sk-") or len(api_key) < 20:
-#         return "❌ Invalid API key format. OpenAI keys should start with 'sk-' and be longer than 20 characters."
-#     try:
-#         from openai import OpenAI
-#         client = OpenAI(api_key=api_key)
-#         # Test with a simple API call
-#         response = client.chat.completions.create(
-#             model="gpt-4o-mini",
-#             messages=[{"role": "user", "content": "Hello"}],
-#             max_tokens=5
-#         )
-#         return "✅ API key is working! You can now upload documents and ask questions."
-#     except Exception as e:
-#         error_msg = str(e).lower()
-#         if "api" in error_msg and "key" in error_msg:
-#             return f"❌ API key error: Invalid or expired API key. Please check your key and account credits."
-#         elif "quota" in error_msg or "limit" in error_msg:
-#             return f"❌ Quota/rate limit error: Your API key has reached its limit or you're out of credits."
-#         elif "connection" in error_msg or "timeout" in error_msg:
-#             return f"❌ Connection error: Unable to reach OpenAI API. Check your internet connection."
-#         else:
-#             return f"❌ Error testing API key: {str(e)}"
-# # def get_collection_info(api_key):
-# #     """Get information about the current collection"""
-# #     global chroma_uploader
-# #     if not api_key:
-# #         return "❌ Please provide an OpenAI API key"
-# #     if chroma_uploader is None or current_api_key != api_key:
-# #         init_msg = initialize_chroma_components(api_key)
-# #         if "Error" in init_msg:
-# #             return init_msg
-# #     try:
-# #         count = chroma_uploader.get_collection_count()
-# #         if count == 0:
-# #             return """📊 Collection is empty
-# # 🚀 **Get started:**
-# # 1. Upload PDF files using the upload section above
-# # 2. Documents will be processed and stored automatically
-# # 3. Then you can ask questions about your documents"""
-# #         else:
-# #             return f"""📊 Collection Status:
-# # 🗃️ **Total documents:** {count} chunks
-# # ✅ **Status:** Ready for questions
-# # 🔍 **You can now:** Ask questions about your uploaded documents"""
-# #     except Exception as e:
-# #         return f"❌ Error getting collection info: {str(e)}"
-# # Create Gradio interface
-# def create_interface():
-#     with gr.Blocks(title="CV Document Q&A System", theme=gr.themes.Soft()) as demo:
-#         gr.Markdown(
-#             """
-#             # 📚 CV Document Q&A System
-#             Upload the CV and ask questions about its content using AI-powered search and retrieval.
-#             **⚠️ Important:** You need to provide your own OpenAI API key to use this application.
-#             """
-#         )
-#         # API Key input (will be hidden)
-#         with gr.Row():
-#             with gr.Column(scale=4):
-#                 api_key_input = gr.Textbox(
-#                     label="🔑 OpenAI API Key",
-#                     placeholder="Enter your OpenAI API key (sk-...)",
-#                     type="password",
-#                     info="Your API key is not stored and is only used for this session"
-#                 )
-#             with gr.Column(scale=1):
-#                 test_key_button = gr.Button("🧪 Test API Key", variant="secondary")
-#         api_test_output = gr.Markdown(label="API Key Status")
-#         test_key_button.click(
-#             test_api_key,
-#             inputs=[api_key_input],
-#             outputs=api_test_output
-#         )
-#         with gr.Tabs():
-#             # Upload Tab (now first)
-#             with gr.Tab("📄 Upload Documents"):
-#                 gr.Markdown("### Upload PDF documents to your knowledge base")
-#                 pdf_upload = gr.File(
-#                     label="Upload PDF File",
-#                     file_types=[".pdf"],
-#                     type="filepath"
-#                 )
-#                 upload_button = gr.Button("📁 Process PDF", variant="primary")
-#                 upload_output = gr.Markdown(label="Upload Status")
-#                 upload_button.click(
-#                     upload_pdf,
-#                     inputs=[api_key_input, pdf_upload],
-#                     outputs=upload_output
-#                 )
-#                 # Collection info
-#                 # info_button = gr.Button("📊 Check Collection Status")
-#                 # info_output = gr.Markdown(label="Collection Information")
-#                 # info_button.click(
-#                 #     get_collection_info,
-#                 #     inputs=[api_key_input],
-#                 #     outputs=info_output
-#                 # )
-#             # Q&A Tab (now second)
-#             with gr.Tab("🤖 Ask Questions"):
-#                 gr.Markdown("### Ask questions about your uploaded documents")
-#                 query_input = gr.Textbox(
-#                     label="Your Question",
-#                     placeholder="Ask me anything about your documents...",
-#                     lines=3
-#                 )
-#                 query_button = gr.Button("🔍 Get Answer", variant="primary")
-#                 query_output = gr.Markdown(label="Answer")
-#                 query_button.click(
-#                     query_documents,
-#                     inputs=[api_key_input, query_input],
-#                     outputs=query_output
-#                 )
-#         # Instructions
-#         with gr.Accordion("📖 How to Use & Troubleshooting", open=False):
-#             gr.Markdown(
-#                 """
-#                 ### Instructions:
-#                 1. **Enter your OpenAI API Key** - Get one from [OpenAI's website](https://platform.openai.com/api-keys)
-#                 2. **Test your API Key** - Click "🧪 Test API Key" to verify it's working
-#                 3. **Upload PDF Documents** - Go to the "Upload Documents" tab and upload your PDF files.
-#                 4. **Ask Questions** - Switch to the "Ask Questions" tab and query your documents
-#                """
-#             )
-#     return demo
-# # Launch the application
-# if __name__ == "__main__":
-#     demo = create_interface()
-#     demo.launch(
-#         server_name="0.0.0.0",
-#         server_port=7860,
-#         share=False  # Set to True to create a public link
-#     )
 import gradio as gr
 import os
 import tempfile

 import gradio as gr
 import os
 import tempfile

chromadb_upload.py CHANGED Viewed

@@ -1,287 +1,4 @@
-# import chromadb
-# import PyPDF2
-# import time
-# import chromadb.utils.embedding_functions as embedding_functions
-# import os
-# import io
-# class ChromaUploader:
-#     def __init__(self, collection_name, db_path, api_key=None):
-#         # Initialize Chroma persistent client and collection name
-#         self.chroma_client = chromadb.PersistentClient(path=db_path)
-#         self.collection_name = collection_name
-#         self.collection = None
-#         # Use provided API key or fall back to environment variable
-#         self.openai_key = api_key or os.getenv("OPENAI_API_KEY")
-#         if not self.openai_key:
-#             raise ValueError("OpenAI API key is required")
-#         self.openai_ef = embedding_functions.OpenAIEmbeddingFunction(
-#             api_key=self.openai_key,
-#             model_name="text-embedding-ada-002"
-#         )
-#         self._initialize_collection()
-#     def _initialize_collection(self):
-#         """
-#         Initializes the collection if it doesn't exist.
-#         """
-#         try:
-#             self.collection = self.chroma_client.get_collection(
-#                 name=self.collection_name,
-#                 embedding_function=self.openai_ef
-#             )
-#             print(f"Collection '{self.collection_name}' already exists.")
-#         except Exception as e:
-#             # If collection doesn't exist, create a new one
-#             self.collection = self.chroma_client.create_collection(
-#                 name=self.collection_name,
-#                 embedding_function=self.openai_ef
-#             )
-#             print(f"Created new collection '{self.collection_name}'.")
-#     def add_documents(self, documents):
-#         """
-#         Adds documents to the collection with retry mechanism and better error handling.
-#         :param documents: List of document strings to be added
-#         """
-#         if documents is None or len(documents) == 0:
-#             print("No data collected from the document to add.")
-#             return False
-#         try:
-#             # Create unique IDs for each document chunk
-#             timestamp = int(time.time() * 1000000)  # microseconds for uniqueness
-#             ids = [f"doc_{timestamp}_{i}" for i in range(len(documents))]
-#             # Filter out empty documents
-#             valid_documents = []
-#             valid_ids = []
-#             for i, doc in enumerate(documents):
-#                 if doc and doc.strip() and len(doc.strip()) > 10:  # Only add non-empty docs with some content
-#                     valid_documents.append(doc.strip())
-#                     valid_ids.append(ids[i])
-#             if not valid_documents:
-#                 print("No valid documents to add after filtering.")
-#                 return False
-#             print(f"Attempting to add {len(valid_documents)} documents to collection...")
-#             # Add documents to collection in smaller batches with retry
-#             batch_size = 20  # Reduced batch size to avoid connection issues
-#             total_added = 0
-#             for i in range(0, len(valid_documents), batch_size):
-#                 batch_docs = valid_documents[i:i + batch_size]
-#                 batch_ids = valid_ids[i:i + batch_size]
-#                 success = self._add_batch_with_retry(batch_docs, batch_ids, max_retries=3)
-#                 if success:
-#                     total_added += len(batch_docs)
-#                     print(f"Successfully added batch {i//batch_size + 1}, total: {total_added}/{len(valid_documents)}")
-#                 else:
-#                     print(f"Failed to add batch {i//batch_size + 1} after retries")
-#                     # Continue with next batch instead of failing completely
-#             if total_added > 0:
-#                 print(f"Successfully added {total_added} out of {len(valid_documents)} documents to collection '{self.collection_name}'.")
-#                 return True
-#             else:
-#                 print("Failed to add any documents to the collection.")
-#                 return False
-#         except Exception as e:
-#             print(f"Error in add_documents: {e}")
-#             return False
-#     def _add_batch_with_retry(self, batch_docs, batch_ids, max_retries=3):
-#         """
-#         Add a batch of documents with retry mechanism
-#         """
-#         import time
-#         for attempt in range(max_retries):
-#             try:
-#                 print(f"Attempt {attempt + 1}/{max_retries} for batch of {len(batch_docs)} documents...")
-#                 self.collection.add(
-#                     documents=batch_docs,
-#                     ids=batch_ids
-#                 )
-#                 return True
-#             except Exception as e:
-#                 error_msg = str(e).lower()
-#                 print(f"Attempt {attempt + 1} failed: {e}")
-#                 if "connection" in error_msg or "timeout" in error_msg or "rate" in error_msg:
-#                     # Network or rate limit issue - wait before retry
-#                     wait_time = (attempt + 1) * 2  # Linear backoff: 2s, 4s, 6s, ...
-#                     print(f"Connection/rate limit issue detected. Waiting {wait_time} seconds before retry...")
-#                     time.sleep(wait_time)
-#                 elif "api" in error_msg and "key" in error_msg:
-#                     # API key issue - no point in retrying
-#                     print("API key issue detected. Cannot retry.")
-#                     return False
-#                 else:
-#                     # Other error - short wait before retry
-#                     time.sleep(1)
-#                 if attempt == max_retries - 1:
-#                     print(f"All {max_retries} attempts failed for this batch.")
-#                     return False
-#         return False
-#     def extract_text_from_pdf_bytes(self, pdf_bytes):
-#         """
-#         Extracts text from a PDF file from bytes (for Gradio uploaded files).
-#         :param pdf_bytes: PDF file as bytes
-#         :return: Tuple of (full extracted text, list of overlapping text chunks); ("", []) on failure
-#         """
-#         try:
-#             # Create a file-like object from bytes
-#             pdf_file = io.BytesIO(pdf_bytes)
-#             # Create a PDF reader object
-#             pdf_reader = PyPDF2.PdfReader(pdf_file)
-#             # Initialize an empty string to store extracted text
-#             text = ""
-#             # Extract text from each page
-#             for page_num, page in enumerate(pdf_reader.pages):
-#                 try:
-#                     # Extract text from the page
-#                     page_text = page.extract_text()
-#                     # Clean up the extracted text
-#                     cleaned_text = self._clean_extracted_text(page_text)
-#                     if cleaned_text.strip():  # Only add non-empty pages
-#                         # Append to the total text with page marker
-#                         text += f"\n--- Page {page_num + 1} ---\n{cleaned_text}\n"
-#                 except Exception as e:
-#                     print(f"Error extracting text from page {page_num + 1}: {e}")
-#                     continue
-#             if not text.strip():
-#                 return "", []
-#             # Split text into meaningful chunks
-#             chunks = self._split_text_into_chunks(text, max_chunk_size=1000, overlap=100)
-#             return text.strip(), chunks
-#         except Exception as e:
-#             print(f"Error extracting text from PDF: {e}")
-#             return "", []
-#     def extract_text_from_pdf(self, pdf_path):
-#         """
-#         Extracts text from a PDF file using PyPDF2 with improved text extraction.
-#         :param pdf_path: Path to the PDF file
-#         :return: Tuple of (full extracted text, list of text chunks); ("", []) on failure
-#         """
-#         try:
-#             # Open the PDF file
-#             with open(pdf_path, 'rb') as file:
-#                 pdf_bytes = file.read()
-#                 return self.extract_text_from_pdf_bytes(pdf_bytes)
-#         except Exception as e:
-#             print(f"Error extracting text from PDF: {e}")
-#             return "", []
-#     def _clean_extracted_text(self, text):
-#         """
-#         Clean up extracted text to improve readability and remove unnecessary whitespace.
-#         :param text: Raw extracted text
-#         :return: Cleaned text
-#         """
-#         if not text:
-#             return ""
-#         # Remove excessive whitespace and clean up
-#         lines = []
-#         for line in text.split('\n'):
-#             cleaned_line = line.strip()
-#             if cleaned_line and len(cleaned_line) > 2:  # Filter out very short lines
-#                 lines.append(cleaned_line)
-#         # Join lines with proper spacing
-#         cleaned_text = ' '.join(lines)
-#         # Remove multiple spaces
-#         while '  ' in cleaned_text:
-#             cleaned_text = cleaned_text.replace('  ', ' ')
-#         return cleaned_text
-#     def _split_text_into_chunks(self, text, max_chunk_size=1000, overlap=100):
-#         """
-#         Split text into overlapping chunks for better context preservation.
-#         :param text: Text to split
-#         :param max_chunk_size: Maximum size of each chunk
-#         :param overlap: Number of characters to overlap between chunks
-#         :return: List of text chunks
-#         """
-#         if not text:
-#             return []
-#         chunks = []
-#         start = 0
-#         while start < len(text):
-#             # Calculate end position
-#             end = start + max_chunk_size
-#             # If we're not at the end of the text, try to end at a sentence boundary
-#             if end < len(text):
-#                 # Look for sentence endings within the last 200 characters
-#                 search_start = max(end - 200, start)
-#                 sentence_endings = ['. ', '! ', '? ', '\n\n']
-#                 best_end = end
-#                 for ending in sentence_endings:
-#                     pos = text.rfind(ending, search_start, end)
-#                     if pos > start:
-#                         best_end = pos + len(ending)
-#                         break
-#                 end = best_end
-#             # Extract chunk
-#             chunk = text[start:end].strip()
-#             if chunk and len(chunk) > 50:  # Only add substantial chunks
-#                 chunks.append(chunk)
-#             # Move start position with overlap
-#             start = max(start + 1, end - overlap)
-#             # Safety check to prevent infinite loops
-#             if start >= len(text):
-#                 break
-#         return chunks
-#     def get_collection_count(self):
-#         """
-#         Get the number of documents in the collection.
-#         """
-#         try:
-#             return self.collection.count()
-#         except Exception as e:
-#             print(f"Error getting collection count: {e}")
-#             return 0
 import chromadb


1



























































































































































































































































































2
3
4	import chromadb