Final_Assignment_Agents

Sleeping

App Files Files Community

ernani committed on Apr 28, 2025

Commit

a057a75

1 Parent(s): 81917a3

Testing the first deployment

Browse files

Files changed (4) hide show

app.py +49 -29
manage_agents.py +439 -0
requirements.txt +18 -2
tools.py +771 -0

app.py CHANGED Viewed

@@ -3,32 +3,42 @@ import gradio as gr
 import requests
 import inspect
 import pandas as pd
-# (Keep Constants as is)
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
-# --- Basic Agent Definition ---
-# ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
-class BasicAgent:
     def __init__(self):
-        print("BasicAgent initialized.")
     def __call__(self, question: str) -> str:
-        print(f"Agent received question (first 50 chars): {question[:50]}...")
-        fixed_answer = "This is a default answer."
-        print(f"Agent returning fixed answer: {fixed_answer}")
-        return fixed_answer
-def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
-    Fetches all questions, runs the BasicAgent on them, submits all answers,
     and displays the results.
     """
     # --- Determine HF Space Runtime URL and Repo URL ---
     space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
     if profile:
-        username= f"{profile.username}"
         print(f"User logged in: {username}")
     else:
         print("User not logged in.")
@@ -38,13 +48,13 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
     questions_url = f"{api_url}/questions"
     submit_url = f"{api_url}/submit"
-    # 1. Instantiate Agent ( modify this part to create your agent)
     try:
-        agent = BasicAgent()
     except Exception as e:
         print(f"Error instantiating agent: {e}")
         return f"Error initializing agent: {e}", None
-    # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
     print(agent_code)
@@ -55,21 +65,21 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
         response.raise_for_status()
         questions_data = response.json()
         if not questions_data:
-             print("Fetched questions list is empty.")
-             return "Fetched questions list is empty or invalid format.", None
         print(f"Fetched {len(questions_data)} questions.")
     except requests.exceptions.RequestException as e:
         print(f"Error fetching questions: {e}")
         return f"Error fetching questions: {e}", None
     except requests.exceptions.JSONDecodeError as e:
-         print(f"Error decoding JSON response from questions endpoint: {e}")
-         print(f"Response text: {response.text[:500]}")
-         return f"Error decoding server response for questions: {e}", None
     except Exception as e:
         print(f"An unexpected error occurred fetching questions: {e}")
         return f"An unexpected error occurred fetching questions: {e}", None
-    # 3. Run your Agent
     results_log = []
     answers_payload = []
     print(f"Running agent on {len(questions_data)} questions...")
@@ -82,17 +92,29 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
         try:
             submitted_answer = agent(question_text)
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
-            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
         except Exception as e:
-             print(f"Error running agent on task {task_id}: {e}")
-             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
     if not answers_payload:
         print("Agent did not produce any answers to submit.")
         return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
-    # 4. Prepare Submission
-    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
     status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
     print(status_update)
@@ -139,10 +161,9 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
         results_df = pd.DataFrame(results_log)
         return status_message, results_df
 # --- Build Gradio Interface using Blocks ---
 with gr.Blocks() as demo:
-    gr.Markdown("# Basic Agent Evaluation Runner")
     gr.Markdown(
         """
         **Instructions:**
@@ -159,7 +180,6 @@ with gr.Blocks() as demo:
     )
     gr.LoginButton()
     run_button = gr.Button("Run Evaluation & Submit All Answers")
     status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)

 import requests
 import inspect
 import pandas as pd
+from manage_agents import MainAgent
+from dotenv import load_dotenv
+# Load environment variables
+load_dotenv()
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
+# --- Agent Implementation ---
+class SearchAgent:
+    """Callable wrapper that answers a question by delegating to ``MainAgent``."""
     def __init__(self):
+        """Create the ``MainAgent`` instance used for every question."""
+        self.agent = MainAgent()
+        print("SearchAgent initialized with MainAgent.")
     def __call__(self, question: str) -> str:
+        """Return the answer for ``question``.
+
+        The first 100 characters of the question and of the answer are printed.
+        Any exception raised while processing is caught and its message is
+        returned as the answer string instead of propagating.
+        """
+        print(f"Processing question: {question[:100]}...")
+        try:
+            answer = self.agent.process_question(question)
+            print(f"Answer generated: {answer[:100]}...")
+            return answer
+        except Exception as e:
+            # Report the failure as the answer so the caller always gets a string.
+            error_msg = f"Error processing question: {str(e)}"
+            print(error_msg)
+            return error_msg
+def run_and_submit_all(profile: gr.OAuthProfile | None):
     """
+    Fetches all questions, runs the SearchAgent on them, submits all answers,
     and displays the results.
     """
     # --- Determine HF Space Runtime URL and Repo URL ---
     space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
     if profile:
+        username = f"{profile.username}"
         print(f"User logged in: {username}")
     else:
         print("User not logged in.")
     questions_url = f"{api_url}/questions"
     submit_url = f"{api_url}/submit"
+    # 1. Instantiate Agent
     try:
+        agent = SearchAgent()
     except Exception as e:
         print(f"Error instantiating agent: {e}")
         return f"Error initializing agent: {e}", None
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
     print(agent_code)
         response.raise_for_status()
         questions_data = response.json()
         if not questions_data:
+            print("Fetched questions list is empty.")
+            return "Fetched questions list is empty or invalid format.", None
         print(f"Fetched {len(questions_data)} questions.")
     except requests.exceptions.RequestException as e:
         print(f"Error fetching questions: {e}")
         return f"Error fetching questions: {e}", None
     except requests.exceptions.JSONDecodeError as e:
+        print(f"Error decoding JSON response from questions endpoint: {e}")
+        print(f"Response text: {response.text[:500]}")
+        return f"Error decoding server response for questions: {e}", None
     except Exception as e:
         print(f"An unexpected error occurred fetching questions: {e}")
         return f"An unexpected error occurred fetching questions: {e}", None
+    # 3. Run Agent
     results_log = []
     answers_payload = []
     print(f"Running agent on {len(questions_data)} questions...")
         try:
             submitted_answer = agent(question_text)
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
+            results_log.append({
+                "Task ID": task_id,
+                "Question": question_text,
+                "Submitted Answer": submitted_answer
+            })
         except Exception as e:
+            print(f"Error running agent on task {task_id}: {e}")
+            results_log.append({
+                "Task ID": task_id,
+                "Question": question_text,
+                "Submitted Answer": f"AGENT ERROR: {e}"
+            })
     if not answers_payload:
         print("Agent did not produce any answers to submit.")
         return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
+    # 4. Prepare Submission
+    submission_data = {
+        "username": username.strip(),
+        "agent_code": agent_code,
+        "answers": answers_payload
+    }
     status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
     print(status_update)
         results_df = pd.DataFrame(results_log)
         return status_message, results_df
 # --- Build Gradio Interface using Blocks ---
 with gr.Blocks() as demo:
+    gr.Markdown("# Search Agent Evaluation Runner")
     gr.Markdown(
         """
         **Instructions:**
     )
     gr.LoginButton()
     run_button = gr.Button("Run Evaluation & Submit All Answers")
     status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)

manage_agents.py ADDED Viewed

	@@ -0,0 +1,439 @@

+from typing import Dict, List, Optional, Tuple
+from langchain.agents import AgentExecutor
+from langchain_openai import ChatOpenAI
+from langchain.memory import ConversationBufferMemory
+from langchain.chains import LLMChain
+from langchain.prompts import PromptTemplate
+from langchain.schema import Document
+from langchain.schema.runnable import RunnablePassthrough
+from langchain.schema.output_parser import StrOutputParser
+import re
+from tools import (
+    YouTubeVideoTool,
+    WikipediaTool,
+    ImageTool,
+    AudioTool,
+    ExcelTool,
+    WebContentTool,
+    PythonTool,
+    ChromaDBManager,
+    ContentProcessingError
+)
+import logging
+class ContentTypeAgent:
+    """Identify what kind of content a question needs and hold the matching tools.
+
+    Classification is rule-based first (reversed text, attachment extension,
+    keywords, YouTube URLs); the LLM chain is only consulted when no rule matches.
+    """
+    # Attachment extension -> key of ``self.tools``.
+    _EXTENSION_MAP = {
+        'mp3': 'audio',
+        'wav': 'audio',
+        'png': 'image',
+        'jpg': 'image',
+        'jpeg': 'image',
+        'xlsx': 'excel',
+        'xls':  'excel',
+        'csv':  'excel',
+        'py':   'python',
+    }
+    # Substrings that route a question to the Wikipedia tool.
+    _ACADEMIC_TERMS = ('paper', 'journal', 'research', 'study', 'published', 'author', 'described')
+    # Very common English words. None of them spells another listed word when
+    # reversed, so a hit in one reading direction cannot also count in the other.
+    _COMMON_WORDS = frozenset({
+        'the', 'of', 'and', 'to', 'is', 'in', 'if', 'you', 'this', 'that',
+        'as', 'what', 'with', 'for', 'from', 'are',
+    })
+    def __init__(self, llm):
+        """Build the tool registry and the LLM classification chain.
+
+        Args:
+            llm: Language model runnable used as the last-resort classifier.
+        """
+        self.llm = llm
+        self.tools = {
+            "youtube": YouTubeVideoTool(),
+            "wiki": WikipediaTool(),
+            "image": ImageTool(),
+            "audio": AudioTool(),
+            "excel": ExcelTool(),
+            "web": WebContentTool(),
+            "python": PythonTool(),
+            "direct": None  # For direct text manipulation tasks
+        }
+        self.type_identification_prompt = PromptTemplate(
+            input_variables=["question"],
+            template="""Analyze the following question and identify what type of content needs to be processed.
+            Question: {question}
+            Possible types:
+            - youtube: If the question mentions a YouTube video or contains a YouTube URL
+            - wiki: If the question refers to Wikipedia article or Wikipedia content
+            - image: If the question refers to an image or contains a task ID for an image
+            - audio: If the question refers to an audio file or contains a task ID for audio
+            - excel: If the question refers to an Excel file or contains a task ID for Excel
+            - web: If the question requires web content processing
+            - python: If the question refers to a Python file or contains a task ID for Python
+            - direct: If the question is a direct text manipulation task (e.g., reversing text, word play, simple text operations)
+            Consider these special cases:
+            1. If the question involves manipulating the text of the question itself (like reversing words, finding opposites), use "direct"
+            2. If the question is about a specific academic paper or research, use "wiki" first
+            3. If the question is about general knowledge that would be in Wikipedia, use "wiki"
+            Return only the type and nothing else."""
+        )
+        # The chain is invoked with the bare question string, which the
+        # passthrough maps onto the prompt's ``question`` variable.
+        self.chain = (
+            {"question": RunnablePassthrough()}
+            | self.type_identification_prompt
+            | self.llm
+            | StrOutputParser()
+        )
+    def _extract_task_id(self, question: str) -> Optional[str]:
+        """Return a task ID found in ``question``, or ``None``.
+
+        An explicit ``task_id`` label followed by a 36-character ID is preferred;
+        otherwise the first UUID-shaped string is used.
+        """
+        task_id_pattern = r'task_id["\':\s]+([a-f0-9-]{36})'
+        match = re.search(task_id_pattern, question, re.IGNORECASE)
+        if match:
+            return match.group(1)
+        uuid_pattern = r'([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})'
+        match = re.search(uuid_pattern, question, re.IGNORECASE)
+        return match.group(1) if match else None
+    def _extract_youtube_url(self, question: str) -> Optional[str]:
+        """Return the first YouTube URL in ``question``, or ``None``.
+
+        Tries a strict ``watch?v=<11 chars>`` URL, then a ``youtu.be`` short URL,
+        then a lenient ``watch?v=`` match with trailing punctuation removed.
+        """
+        youtube_pattern = r'https?://(?:www\.)?youtube\.com/watch\?v=[a-zA-Z0-9_-]{11}'
+        match = re.search(youtube_pattern, question)
+        if match:
+            return match.group(0)
+        youtube_short_pattern = r'https?://(?:www\.)?youtu\.be/[a-zA-Z0-9_-]{11}'
+        match = re.search(youtube_short_pattern, question)
+        if match:
+            return match.group(0)
+        youtube_lenient_pattern = r'https?://(?:www\.)?youtube\.com/watch\?v=[^\s\.,!?]+'
+        match = re.search(youtube_lenient_pattern, question)
+        if match:
+            return match.group(0).strip().rstrip('.,!?')
+        return None
+    def _count_common_words(self, lowered_text: str) -> int:
+        """Count alphabetic tokens of ``lowered_text`` that are in ``_COMMON_WORDS``."""
+        return sum(word in self._COMMON_WORDS for word in re.findall(r'[a-z]+', lowered_text))
+    def _is_reversed_text(self, text: str) -> bool:
+        """Return True when ``text`` looks like English written backwards.
+
+        The text counts as reversed when it contains an explicit marker
+        ("backwards." or "reverse" spelled in reverse), or when reading it
+        backwards yields at least two common English words and more of them
+        than reading it forwards.
+
+        A consonant-cluster test is deliberately not used: ordinary input such
+        as "https", "months" or "rights" contains four consecutive consonants,
+        which would misclassify normal questions (including every https URL).
+        """
+        lowered = text.lower()
+        if '.sdrawkcab' in lowered or 'esrever' in lowered:
+            return True
+        forward_hits = self._count_common_words(lowered)
+        backward_hits = self._count_common_words(lowered[::-1])
+        return backward_hits >= 2 and backward_hits > forward_hits
+    def _process_direct_text(self, text: str) -> str:
+        """Answer a direct text-manipulation task.
+
+        Reversed text is flipped back. If the restored text mentions "left" the
+        answer is "right" (a hard-coded special case); otherwise the restored
+        text is returned. Text that is not reversed is returned unchanged.
+        """
+        if self._is_reversed_text(text):
+            reversed_text = text[::-1]
+            if "left" in reversed_text:
+                return "right"  # Handle the specific case about "left" opposite
+            return reversed_text
+        return text
+    def identify_content_type(self, question: str, file_name: str) -> Tuple[str, Optional[str]]:
+        """Classify ``question`` and return ``(content_type, parameter)``.
+
+        Rules are applied in order: reversed text -> "direct"; known attachment
+        extension -> mapped type; Wikipedia/academic keywords -> "wiki"; YouTube
+        URL or keywords -> "youtube"; otherwise the LLM chain decides.
+
+        Args:
+            question: The question text.
+            file_name: Name of an attached file, or an empty string.
+
+        Returns:
+            The content type and the parameter to hand to the tool: the
+            extracted URL for a YouTube match with a URL, else the question.
+        """
+        if self._is_reversed_text(question):
+            return "direct", question
+        if file_name:
+            extension = file_name.split('.')[-1].lower()
+            if extension in self._EXTENSION_MAP:
+                return self._EXTENSION_MAP[extension], question
+        question_lower = question.lower()
+        if 'wikipedia' in question_lower or any(term in question_lower for term in self._ACADEMIC_TERMS):
+            return "wiki", question
+        youtube_url = self._extract_youtube_url(question)
+        if youtube_url or any(indicator in question_lower for indicator in ('youtube', 'video')):
+            return "youtube", youtube_url if youtube_url else question
+        # Last resort: ask the LLM. Strip quoting/punctuation the model may wrap
+        # around the label so it still matches a key of ``self.tools``.
+        raw_type = self.chain.invoke(question)
+        content_type = raw_type.strip().strip('"\'`.').lower()
+        return content_type, question
+class RAGAgent:
+    """Store extracted documents in ChromaDB and answer questions from them."""
+    def __init__(self, llm, chroma_manager: ChromaDBManager):
+        """Set up the metadata whitelist, the answer prompt and the answer chain.
+
+        Args:
+            llm: Language model runnable used to generate the final answer.
+            chroma_manager: Manager used to store and query document collections.
+        """
+        self.llm = llm
+        self.chroma_manager = chroma_manager
+        # Define which metadata fields are relevant for different content types
+        self.relevant_metadata_fields = {
+            'youtube': ['title', 'author', 'duration', 'view_count'],
+            'image': ['source', 'type', 'analysis_type'],
+            'audio': ['source', 'type', 'duration', 'language'],
+            'excel': ['row_count', 'column_count', 'columns'],
+            'web': ['title', 'url', 'source'],
+            'wiki': ['title', 'source', 'language'],
+            'default': ['source', 'type', 'title']
+        }
+        self.answer_prompt = PromptTemplate(
+            input_variables=["context", "question"],
+            template="""You are a helpful AI assistant that provides accurate answers based on the given context. If the context contains the information needed to answer the question, provide a clear and concise answer. If the information in the context is not sufficient or relevant to answer the question, explain what specific information is missing.
+            Context: {context}
+            Question: {question}
+            Instructions:
+            1. First, carefully analyze if the context contains ALL necessary information to answer the question
+            2. If yes:
+               - Extract ONLY the specific information asked for
+               - Format the answer exactly as requested in the question
+               - Double-check your answer for accuracy
+               - Do not include explanations unless specifically asked
+               - If you've been asked something like: what is the name, answer with the name and nothing else, and the same for other questions
+            3. If no:
+               - Clearly state what specific information is missing
+               - Do not make assumptions or guesses
+            4. Always base your answer strictly on the provided context
+            5. For lists:
+               - Include ONLY items that strictly match the criteria
+               - Follow any sorting/formatting requirements exactly
+               - Verify each item individually before including it
+            Answer:"""
+        )
+        # The chain is invoked with a {"context": ..., "question": ...} dict, which
+        # the prompt consumes directly. Mapping each key to RunnablePassthrough()
+        # would hand the *whole* input dict to both placeholders.
+        self.chain = self.answer_prompt | self.llm | StrOutputParser()
+    def _clean_content(self, content: str) -> str:
+        """Normalise document text.
+
+        Collapses whitespace runs to single spaces, replaces runs of four or
+        more non-word, non-space characters with "...", and caps the result at
+        10000 characters. Empty input yields an empty string.
+        """
+        if not content:
+            return ""
+        content = re.sub(r'\s+', ' ', content).strip()
+        content = re.sub(r'[^\w\s]{4,}', '...', content)
+        # Slicing is a no-op for shorter strings, so no length check is needed.
+        return content[:10000]
+    def _format_metadata(self, metadata: dict, content_type: str) -> dict:
+        """Keep only the metadata fields relevant to ``content_type``.
+
+        Unknown content types use the 'default' field list. ``None`` values are
+        dropped, lists/dicts are stringified, and strings longer than 200
+        characters are truncated to 197 characters plus "...".
+        """
+        if not metadata:
+            return {}
+        relevant_fields = self.relevant_metadata_fields.get(
+            content_type,
+            self.relevant_metadata_fields['default']
+        )
+        cleaned_metadata = {}
+        for field in relevant_fields:
+            value = metadata.get(field)
+            if value is None:
+                continue
+            if isinstance(value, (list, dict)):
+                value = str(value)
+            if isinstance(value, str) and len(value) > 200:
+                value = value[:197] + "..."
+            cleaned_metadata[field] = value
+        return cleaned_metadata
+    def process_and_store(self, documents: List[Document], collection_name: str):
+        """Clean ``documents`` and store them in a fresh collection.
+
+        Any existing collection with the same name is deleted first. Documents
+        whose cleaned content is empty are skipped.
+
+        Raises:
+            ValueError: If no document has content left after cleaning.
+            Exception: Any storage error is logged and re-raised.
+        """
+        try:
+            # Best-effort delete: a failure here is logged at debug level and
+            # treated as "the collection did not exist".
+            try:
+                self.chroma_manager.client.delete_collection(collection_name)
+                logging.info(f"Deleted existing collection {collection_name}")
+            except Exception as e:
+                logging.debug(f"Collection {collection_name} did not exist: {str(e)}")
+            processed_docs = []
+            processed_metadata = []
+            for doc in documents:
+                cleaned_content = self._clean_content(doc.page_content)
+                if not cleaned_content:
+                    continue
+                content_type = doc.metadata.get('type', 'default')
+                processed_docs.append(cleaned_content)
+                processed_metadata.append(self._format_metadata(doc.metadata, content_type))
+            if not processed_docs:
+                raise ValueError("No valid documents to store after processing")
+            logging.info(f"Storing {len(processed_docs)} processed documents in collection {collection_name}")
+            self.chroma_manager.add_documents_with_metadata(collection_name, processed_docs, processed_metadata)
+        except Exception as e:
+            logging.error(f"Error storing documents in ChromaDB: {str(e)}")
+            raise
+    def _extract_answer(self, chain_output) -> str:
+        """Return the answer text from a chain result.
+
+        Accepts a plain string, an object with a ``content`` attribute, or a
+        dict holding a string under 'text', 'output', 'result' or 'answer'.
+        Any other shape, or an error while extracting, yields a fixed
+        fallback message.
+        """
+        try:
+            if isinstance(chain_output, str):
+                return chain_output.strip()
+            elif hasattr(chain_output, 'content'):
+                return chain_output.content.strip()
+            elif isinstance(chain_output, dict):
+                for key in ['text', 'output', 'result', 'answer']:
+                    if key in chain_output and isinstance(chain_output[key], str):
+                        return chain_output[key].strip()
+            logging.warning("Unexpected chain output format")
+            return "Could not generate an answer from the available information."
+        except Exception as e:
+            logging.error(f"Error extracting answer: {str(e)}")
+            return "Error processing the answer."
+    def retrieve_and_generate(self, question: str, collection_name: str) -> str:
+        """Query the collection for ``question`` and generate an answer.
+
+        Each retrieved document becomes one context entry (bracketed metadata
+        line, then "Content: ..."); entries are joined with "---" separators.
+
+        Returns:
+            The generated answer, a fixed message when nothing was retrieved,
+            or an "Error generating answer: ..." string if anything fails.
+        """
+        try:
+            results = self.chroma_manager.query_collection(collection_name, question)
+            if not results or not results['documents'] or not results['documents'][0]:
+                logging.warning(f"No results found for question in collection {collection_name}")
+                return "No relevant information found to answer the question."
+            contexts = []
+            for doc_content, metadata in zip(results['documents'][0], results['metadatas'][0]):
+                context_parts = []
+                if metadata:
+                    metadata_str = ", ".join(f"{k}: {v}" for k, v in metadata.items())
+                    context_parts.append(f"[{metadata_str}]")
+                cleaned_content = self._clean_content(doc_content)
+                if cleaned_content:
+                    context_parts.append(f"Content: {cleaned_content}")
+                if context_parts:
+                    contexts.append("\n".join(context_parts))
+            context = "\n\n---\n\n".join(contexts)
+            logging.debug(f"Combined context length: {len(context)}")
+            chain_output = self.chain.invoke({"context": context, "question": question})
+            answer = self._extract_answer(chain_output)
+            logging.info(f"Generated answer for question: {question[:100]}...")
+            return answer
+        except Exception as e:
+            logging.error(f"Error in retrieve_and_generate: {str(e)}")
+            return f"Error generating answer: {str(e)}"
+class MainAgent:
+    """Orchestrate classification, content extraction, storage and answering."""
+    def __init__(self):
+        """Create the LLM, the ChromaDB manager and the two helper agents."""
+        self.llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")
+        self.chroma_manager = ChromaDBManager()
+        self.content_type_agent = ContentTypeAgent(self.llm)
+        self.rag_agent = RAGAgent(self.llm, self.chroma_manager)
+    def _extract_documents(self, tool, content_type: str, parameter: str, question: str):
+        """Run ``tool`` for ``parameter`` and return what it produces.
+
+        Fallbacks: a failed "wiki" lookup is retried through the "web" tool with
+        a site-restricted query; any remaining failure is retried through the
+        "web" tool when the question mentions "paper" or "research", and is
+        re-raised otherwise.
+        """
+        tools = self.content_type_agent.tools
+        try:
+            if content_type == "wiki":
+                try:
+                    return tool._run(parameter)
+                except Exception as wiki_error:
+                    print(f"Wikipedia search failed: {wiki_error}")
+                    modified_query = f"{parameter} site:wikipedia.org OR site:researchgate.net OR site:scholar.google.com"
+                    return tools["web"]._run(modified_query)
+            if content_type == "image":
+                # The image tool also receives the question as context.
+                return tool._run(parameter, question=question)
+            return tool._run(parameter)
+        except Exception as e:
+            print(f"Tool execution failed: {e}")
+            question_lower = question.lower()
+            if "paper" in question_lower or "research" in question_lower:
+                modified_query = f"{parameter} site:scholar.google.com OR site:researchgate.net"
+                return tools["web"]._run(modified_query)
+            raise
+    def _collection_name(self, question: str) -> str:
+        """Return the collection name for ``question``.
+
+        Uses the task ID when the question contains one; otherwise a SHA-256
+        digest of the question. The built-in ``hash()`` is avoided because str
+        hashes are salted per process, which would give the same question a
+        different collection name after every restart.
+        """
+        import hashlib
+        task_id = self.content_type_agent._extract_task_id(question)
+        if task_id:
+            return f"collection_{task_id}"
+        digest = hashlib.sha256(question.encode("utf-8")).hexdigest()[:32]
+        return f"collection_{digest}"
+    def process_question(self, question: str, file_name: str = "") -> str:
+        """Answer ``question``, optionally using the attachment ``file_name``.
+
+        Steps: classify the question, run the matching tool, store the result
+        in ChromaDB, then generate the answer from the stored documents.
+        Failures are reported as message strings rather than raised.
+
+        Args:
+            question: The question text.
+            file_name: Name of an attached file, or an empty string.
+
+        Returns:
+            The answer, or a message describing why none could be produced.
+        """
+        try:
+            # 1. Identify content type and parameter
+            content_type, parameter = self.content_type_agent.identify_content_type(question, file_name)
+            print("Content type:", content_type)
+            print("Parameter:", parameter)
+            if content_type == "direct":
+                return self.content_type_agent._process_direct_text(parameter)
+            if not parameter:
+                return "Could not identify necessary information (URL, task ID, etc.) from the question."
+            # 2. Use appropriate tool to extract information
+            tool = self.content_type_agent.tools.get(content_type)
+            if tool is None:
+                return f"Unsupported content type: {content_type}"
+            documents = self._extract_documents(tool, content_type, parameter, question)
+            if not documents:
+                return "Could not extract any information from the content."
+            # 3. Store in ChromaDB with task ID in collection name if available
+            collection_name = self._collection_name(question)
+            try:
+                self.rag_agent.process_and_store(documents, collection_name)
+            except Exception as e:
+                # Deliberate best effort: continue to the answer step even if storage fails.
+                print(f"Warning: Error storing in ChromaDB: {e}")
+            # 4. Generate answer using RAG
+            return self.rag_agent.retrieve_and_generate(question, collection_name)
+        except ContentProcessingError as e:
+            return f"Error processing content: {str(e)}"
+        except Exception as e:
+            return f"An unexpected error occurred: {str(e)}"

requirements.txt CHANGED Viewed

@@ -1,2 +1,18 @@
-gradio
-requests

+aiohttp>=3.8.0
+beautifulsoup4>=4.12.0
+chromadb>=0.4.0
+duckduckgo-search>=3.0.0
+gradio>=4.0.0
+langchain>=0.1.0
+langchain_community>=0.1.0
+langchain_openai>=0.1.0
+librosa>=0.10.0
+openai>=1.3.0
+pandas>=2.0.0
+pillow>=10.0.0
+PyPDF2>=3.0.0
+python-dotenv>=1.0.0
+pytube>=15.0.0
+requests>=2.31.0
+wikipedia
+youtube-transcript-api>=0.6.1

tools.py ADDED Viewed

	@@ -0,0 +1,771 @@

+import os
+import io
+from typing import Dict, List, Optional, Any
+import requests
+from langchain.tools import BaseTool
+from langchain.schema import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.tools import WikipediaQueryRun, DuckDuckGoSearchRun
+from langchain_community.document_loaders import PythonLoader
+from langchain_community.utilities import WikipediaAPIWrapper
+import chromadb
+from chromadb.config import Settings
+import pytube
+from PIL import Image
+import pandas as pd
+import librosa
+import json
+from youtube_transcript_api import YouTubeTranscriptApi
+from langchain_community.document_loaders import YoutubeLoader
+import re
+import base64
+from io import BytesIO
+from openai import OpenAI
+import aiohttp
+import logging
+from PyPDF2 import PdfReader
+from pydantic import Field
+logger = logging.getLogger(__name__)
+class ContentProcessingError(Exception):
+    """Root of the exception hierarchy raised by the content processing tools."""
+class ImageProcessingError(ContentProcessingError):
+    """Raised when an image cannot be fetched or analysed."""
+class AudioProcessingError(ContentProcessingError):
+    """Raised when an audio file cannot be saved or transcribed."""
+class VideoProcessingError(ContentProcessingError):
+    """Raised when a video URL or its content cannot be processed."""
+class WebProcessingError(ContentProcessingError):
+    """Raised when a web search or web page cannot be processed."""
+def encode_image_to_base64(image_content: bytes) -> str:
+    """Return the standard Base64 encoding of ``image_content`` as a text string."""
+    encoded_bytes = base64.b64encode(image_content)
+    return str(encoded_bytes, 'utf-8')
+class BaseContentTool(BaseTool):
+    """Base class for all content processing tools.
+    Supplies the shared text splitter and the helpers that pull a task ID out
+    of free text and download that task's metadata and file from the scoring
+    API.
+    """
+    # Splits text into 1000-character chunks that overlap by 200 characters.
+    text_splitter: RecursiveCharacterTextSplitter = Field(default_factory=lambda: RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=200,
+        length_function=len,
+        separators=["\n\n", "\n", " ", ""]
+    ))
+    # Seconds to wait on each HTTP request; without a timeout `requests` can
+    # block forever on a stalled server.
+    request_timeout: float = Field(default=30.0)
+    def _extract_task_id(self, text: str) -> Optional[str]:
+        """Return the first task ID found in ``text``, or ``None`` if there is none.
+        A value introduced by a ``task_id`` label is preferred; otherwise the
+        first UUID-shaped substring is used. Matching is case-insensitive.
+        """
+        # First try to find task_id in the metadata
+        task_id_pattern = r'task_id["\':\s]+([a-f0-9-]{36})'
+        match = re.search(task_id_pattern, text, re.IGNORECASE)
+        if match:
+            return match.group(1)
+        # Then try to find any UUID-like string that might be a task ID
+        uuid_pattern = r'([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})'
+        match = re.search(uuid_pattern, text, re.IGNORECASE)
+        return match.group(1) if match else None
+    def _get_file_metadata(self, task_id: str) -> dict:
+        """Fetch the JSON metadata for a task from the scoring API.
+        Args:
+            task_id: A task ID, possibly embedded in a longer string.
+        Returns:
+            The decoded JSON body of the metadata endpoint.
+        Raises:
+            ContentProcessingError: If no task ID can be extracted or the
+                HTTP request fails.
+        """
+        # Extract task ID if it's embedded in a longer string
+        extracted_id = self._extract_task_id(task_id)
+        if not extracted_id:
+            raise ContentProcessingError(f"Could not extract valid task ID from: {task_id}")
+        base_url = "https://agents-course-unit4-scoring.hf.space/metadata"
+        url = f"{base_url}/{extracted_id}"
+        try:
+            response = requests.get(url, timeout=self.request_timeout)
+            response.raise_for_status()
+            return response.json()
+        except requests.exceptions.RequestException as e:
+            raise ContentProcessingError(f"Error fetching file metadata: {str(e)}") from e
+    def _get_file_from_task_id(self, task_id: str, expected_type: str) -> bytes:
+        """Download the file attached to a task and return its raw bytes.
+        Args:
+            task_id: A task ID, possibly embedded in a longer string.
+            expected_type: Substring that must appear in the response's
+                ``content-type`` header (``application/json`` is always
+                accepted as well).
+        Returns:
+            The response body as bytes.
+        Raises:
+            ContentProcessingError: If the ID is missing, the metadata is
+                empty, the request fails, or the content type does not match.
+        """
+        # Extract task ID if it's embedded in a longer string
+        extracted_id = self._extract_task_id(task_id)
+        if not extracted_id:
+            raise ContentProcessingError(f"Could not extract valid task ID from: {task_id}")
+        base_url = "https://agents-course-unit4-scoring.hf.space/files"
+        url = f"{base_url}/{extracted_id}"
+        try:
+            # First get metadata to verify the task actually has a file
+            metadata = self._get_file_metadata(extracted_id)
+            if not metadata:
+                raise ContentProcessingError(f"No metadata found for task ID: {extracted_id}")
+            # Make request for file content
+            response = requests.get(
+                url,
+                headers={'accept': 'application/json'},
+                timeout=self.request_timeout
+            )
+            response.raise_for_status()
+            # Check content type from response headers
+            # NOTE(review): this is a plain substring test, so e.g. "excel" will
+            # not match an .xlsx "spreadsheetml" content type - confirm what the
+            # server actually sends.
+            content_type = response.headers.get('content-type', '').lower()
+            if expected_type not in content_type and 'application/json' not in content_type:
+                raise ContentProcessingError(f"Expected file type {expected_type} but got {content_type}")
+            return response.content
+        except ContentProcessingError:
+            # Already carries a precise message; do not wrap it a second time.
+            raise
+        except requests.exceptions.RequestException as e:
+            raise ContentProcessingError(f"Error fetching file: {str(e)}") from e
+        except Exception as e:
+            raise ContentProcessingError(f"Error processing file: {str(e)}") from e
+class WikipediaTool(BaseContentTool):
+    """Tool that looks a question up on Wikipedia and chunks the returned text."""
+    name: str = "wikipedia_processor"
+    description: str = "Process Wikipedia articles to extract information"
+    def _run(self, question: str) -> List[Document]:
+        """Query Wikipedia with ``question`` and return the result as split documents.
+        Every chunk carries the same metadata, including the original question.
+        Any failure is re-raised as ``ContentProcessingError``.
+        """
+        chunk_metadata = {
+            "source": "wikipedia",
+            "type": "wikipedia",
+            "question_context": question,
+            "content_type": "wikipedia_analysis",
+            "language": "en"
+        }
+        try:
+            # A fresh API wrapper is created per call, as before
+            article_text = WikipediaAPIWrapper().run(question)
+            return self.text_splitter.create_documents(
+                [article_text],
+                metadatas=[chunk_metadata]
+            )
+        except Exception as e:
+            raise ContentProcessingError(f"Error processing Wikipedia article: {str(e)}")
+    async def _arun(self, question: str) -> List[Document]:
+        """Asynchronous entry point; not implemented for this tool."""
+        raise NotImplementedError("Async version not implemented yet")
+class YouTubeVideoTool(BaseContentTool):
+    """Tool for processing YouTube videos by loading and chunking their transcript."""
+    name: str = "youtube_video_processor"
+    description: str = "Process YouTube videos to extract information"
+    def _clean_url(self, url: str) -> str:
+        """Strip whitespace and trailing punctuation, and expand ``youtu.be`` short links.
+        Short links are rewritten to the ``https://www.youtube.com/watch?v=<id>``
+        form; any other URL is returned as cleaned.
+        """
+        # Remove trailing punctuation and whitespace
+        url = url.strip().rstrip('.!?,;:')
+        # Ensure we have a valid YouTube URL
+        if 'youtu.be' in url:
+            video_id = url.split('/')[-1].split('?')[0]
+            return f"https://www.youtube.com/watch?v={video_id}"
+        return url
+    def _extract_video_id(self, url: str) -> str:
+        """Return the video ID contained in ``url``.
+        Raises:
+            VideoProcessingError: If the URL is not a recognised YouTube URL
+                or a ``youtube.com`` URL has no ``v`` query parameter.
+        """
+        if 'youtu.be' in url:
+            return url.split('/')[-1].split('?')[0]
+        if 'youtube.com' in url:
+            from urllib.parse import parse_qs, urlparse
+            # .get() avoids a bare KeyError for URLs without a "v" parameter
+            values = parse_qs(urlparse(url).query).get('v')
+            if values:
+                return values[0]
+        raise VideoProcessingError("Invalid YouTube URL format")
+    def _run(self, video_url: str) -> List[Document]:
+        """Load the transcript of ``video_url`` and return it as split documents.
+        The transcript is kept in memory; nothing is written to disk. If the
+        transcript cannot be loaded, or any other step fails, a single
+        ``Document`` describing the problem is returned instead of raising.
+        """
+        try:
+            # Clean the URL first
+            clean_url = self._clean_url(video_url)
+            video_id = self._extract_video_id(clean_url)
+            metadata = {
+                "source": clean_url,
+                "type": "youtube_video",
+                "video_id": video_id
+            }
+            try:
+                loader = YoutubeLoader.from_youtube_url(
+                    clean_url,
+                    add_video_info=False,  # Set to True if you want to include video metadata
+                    language=["en", "id"],
+                    translation="en"
+                )
+                # Load once; each call hits the transcript service again
+                transcript_docs = loader.load()
+            except Exception as e:
+                logging.warning(f"Could not get transcript: {e}")
+                error_msg = "Could not access any video content. This might be due to:"
+                error_msg += "\n- Video is private or unavailable"
+                error_msg += "\n- No transcript available"
+                error_msg += "\n- API access restrictions"
+                return [Document(
+                    page_content=error_msg,
+                    metadata=metadata
+                )]
+            # Join the text of the loaded documents (not the characters of one string)
+            transcript_text = "\n".join(doc.page_content for doc in transcript_docs)
+            return self.text_splitter.create_documents(
+                [transcript_text],
+                metadatas=[metadata]
+            )
+        except Exception as e:
+            error_msg = f"Error processing YouTube video: {str(e)}"
+            logging.error(error_msg)
+            return [Document(
+                page_content=error_msg,
+                metadata={"source": video_url, "type": "youtube_video", "error": str(e)}
+            )]
+    async def _arun(self, video_url: str) -> List[Document]:
+        """Asynchronous entry point; not implemented for this tool."""
+        raise NotImplementedError("Async version not implemented yet")
+class PythonTool(BaseContentTool):
+    """Tool for processing Python files attached to a task."""
+    name: str = "python_processor"
+    description: str = "Process Python files to extract information"
+    # Directory for the short-lived copies handed to PythonLoader.
+    temp_dir: str = Field(default="temp_python")
+    def __init__(self, **kwargs):
+        """Initialise the tool and make sure the temporary directory exists."""
+        super().__init__(**kwargs)
+        os.makedirs(self.temp_dir, exist_ok=True)
+    def _save_temp_python(self, content: bytes, task_id: str) -> str:
+        """Write ``content`` to ``<temp_dir>/<id>.py`` and return that path.
+        The file name is built from the extracted task ID, or from a sanitised
+        copy of ``task_id`` when no ID is found, because ``task_id`` may be a
+        longer string containing characters that are invalid in a file name
+        or that would escape ``temp_dir``.
+        Raises:
+            ContentProcessingError: If the file cannot be written.
+        """
+        safe_name = (
+            self._extract_task_id(task_id)
+            or re.sub(r'[^A-Za-z0-9._-]+', '_', task_id)
+            or "python_file"
+        )
+        temp_path = os.path.join(self.temp_dir, f"{safe_name}.py")
+        try:
+            with open(temp_path, "wb") as f:
+                f.write(content)
+            return temp_path
+        except Exception as e:
+            raise ContentProcessingError(f"Error saving temporary Python file: {str(e)}") from e
+    def _clean_temp_file(self, file_path: str):
+        """Remove ``file_path`` if it exists; failures are logged, never raised."""
+        try:
+            if os.path.exists(file_path):
+                os.remove(file_path)
+        except Exception as e:
+            logging.warning(f"Warning: Could not remove temporary file {file_path}: {str(e)}")
+    def _run(self, task_id: str) -> List[Document]:
+        """Download the task's Python file and return it as documents.
+        On any failure a single ``Document`` describing the error is returned
+        instead of raising. The temporary copy is always removed.
+        """
+        temp_path = None
+        try:
+            # Get file content using base class method
+            content = self._get_file_from_task_id(task_id, "python")
+            # Save to temporary file for PythonLoader
+            temp_path = self._save_temp_python(content, task_id)
+            # Use PythonLoader to process the file
+            loader = PythonLoader(temp_path)
+            documents = loader.load()
+            # Add metadata to documents
+            for doc in documents:
+                doc.metadata.update({
+                    "source": task_id,
+                    "type": "python",
+                    "content_type": "python_code"
+                })
+            return documents
+        except Exception as e:
+            error_msg = f"Error processing Python file: {str(e)}"
+            logging.error(error_msg)
+            return [Document(
+                page_content=error_msg,
+                metadata={"source": task_id, "type": "python", "error": str(e)}
+            )]
+        finally:
+            # Clean up temporary file
+            if temp_path:
+                self._clean_temp_file(temp_path)
+    async def _arun(self, task_id: str) -> List[Document]:
+        """Async wrapper that simply delegates to the synchronous ``_run``."""
+        return self._run(task_id)
+class ImageTool(BaseContentTool):
+    """Tool for processing images with an OpenAI vision-capable chat model."""
+    name: str = "image_processor"
+    description: str = "Process images from task IDs using GPT-4V"
+    client: OpenAI = Field(default_factory=OpenAI)
+    # Model used for image analysis. The previously hard-coded
+    # "gpt-4-vision-preview" has been retired by OpenAI, so a current
+    # vision-capable model is the default; override per instance if needed.
+    vision_model: str = Field(default="gpt-4o")
+    base_system_prompt: str = """You are an expert at analyzing images with strong attention to detail.
+    Your task is to provide a detailed, objective description of the image content.
+    Focus on:
+    1. Key visual elements and their relationships
+    2. Any text or numbers present in the image
+    3. Specific details that might be relevant to answering questions about the image
+    4. Technical or specialized content (diagrams, charts, game positions, etc.)
+    Provide your analysis in a clear, structured format that can be used by a language model to answer specific questions about the image."""
+    def _generate_context_aware_prompt(self, question: str) -> str:
+        """Return the base system prompt plus focus areas triggered by keywords in ``question``.
+        Keywords are matched as case-insensitive substrings. Recognised topics
+        are chess, counting, text/reading and colour; when none match, the
+        base prompt is returned unchanged.
+        """
+        # Extract key information from the question
+        question_lower = question.lower()
+        # Add specialized instructions based on question context
+        specialized_instructions = []
+        if "chess" in question_lower:
+            specialized_instructions.append("""
+            For chess positions:
+            - Describe the position of all pieces using algebraic notation
+            - Note any significant tactical or strategic elements
+            - If asked about moves, specify them in algebraic notation""")
+        if any(word in question_lower for word in ["count", "number", "how many"]):
+            specialized_instructions.append("""
+            Pay special attention to counting and quantifying elements in the image.
+            Provide specific numbers and ensure accuracy in counting.""")
+        if "text" in question_lower or "write" in question_lower or "read" in question_lower:
+            specialized_instructions.append("""
+            Focus on any text content:
+            - Read and transcribe all visible text
+            - Note the location and context of text elements
+            - Pay attention to any numbers, symbols, or special characters""")
+        if "color" in question_lower or "colour" in question_lower:
+            specialized_instructions.append("""
+            Pay special attention to colors:
+            - Describe colors precisely
+            - Note color patterns or relationships
+            - Mention any color-based groupings or distinctions""")
+        # Combine base prompt with specialized instructions
+        full_prompt = self.base_system_prompt
+        if specialized_instructions:
+            full_prompt += "\n\nSpecific focus areas for this image:\n" + "\n".join(specialized_instructions)
+        return full_prompt
+    def _process_image_with_gpt4v(self, image_content: bytes, question: str) -> str:
+        """Send the image and question to the vision model and return its text analysis.
+        Raises:
+            ImageProcessingError: If encoding or the API call fails.
+        """
+        try:
+            # Convert image to base64
+            base64_image = encode_image_to_base64(image_content)
+            # Generate context-aware system prompt
+            system_prompt = self._generate_context_aware_prompt(question)
+            # Prepare the messages
+            # NOTE(review): the data URL always declares image/jpeg regardless
+            # of the real format - confirm this is acceptable for PNG inputs.
+            messages = [
+                {
+                    "role": "system",
+                    "content": system_prompt
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{base64_image}"
+                            }
+                        },
+                        {
+                            "type": "text",
+                            "text": f"Analyze this image in detail, keeping in mind the following question: {question}"
+                        }
+                    ]
+                }
+            ]
+            # Call the vision model
+            response = self.client.chat.completions.create(
+                model=self.vision_model,
+                messages=messages,
+                max_tokens=500,
+                temperature=0.2  # Lower temperature for more focused analysis
+            )
+            return response.choices[0].message.content
+        except Exception as e:
+            raise ImageProcessingError(f"Error processing image with GPT-4V: {str(e)}") from e
+    def _run(self, task_id: str, question: str = "") -> Document:
+        """Download the task's image, analyse it, and return the analysis as one document.
+        NOTE(review): this returns a single ``Document`` while the sibling
+        tools return a list - confirm the caller handles both shapes.
+        Raises:
+            ImageProcessingError: If the download or the analysis fails.
+        """
+        try:
+            # Get image content
+            image_content = self._get_file_from_task_id(task_id, "image")
+            # Process image with the vision model
+            analysis = self._process_image_with_gpt4v(image_content, question)
+            # Create document with metadata
+            return Document(
+                page_content=analysis,
+                metadata={
+                    "source": task_id,
+                    "type": "image",
+                    "content_type": "gpt4v_analysis",
+                    "question_context": question
+                }
+            )
+        except Exception as e:
+            raise ImageProcessingError(f"Error processing image: {str(e)}") from e
+    async def _arun(self, task_id: str, question: str = "") -> Document:
+        """Asynchronous entry point; not implemented for this tool.
+        Declared ``async`` with the same parameters as ``_run`` so it matches
+        the other tools' ``_arun`` methods.
+        """
+        raise NotImplementedError("Async version not implemented yet")
+class AudioTool(BaseContentTool):
+    """Tool for processing audio files using Whisper"""
+    name: str = "audio_processor"
+    description: str = "Process audio files from task IDs using Whisper"
+    client: OpenAI = Field(default_factory=OpenAI)
+    # Directory for the short-lived audio copies uploaded to the API.
+    temp_dir: str = Field(default="temp_audio")
+    def __init__(self, **kwargs):
+        """Initialise the tool and make sure the temporary directory exists."""
+        super().__init__(**kwargs)
+        os.makedirs(self.temp_dir, exist_ok=True)
+    def _save_temp_audio(self, audio_content: bytes, task_id: str) -> str:
+        """Write ``audio_content`` to ``<temp_dir>/<id>.mp3`` and return that path.
+        The file name is built from the extracted task ID, or from a sanitised
+        copy of ``task_id`` when no ID is found, because ``task_id`` may be a
+        longer string that is not a safe file name.
+        Raises:
+            AudioProcessingError: If the file cannot be written.
+        """
+        safe_name = (
+            self._extract_task_id(task_id)
+            or re.sub(r'[^A-Za-z0-9._-]+', '_', task_id)
+            or "audio_file"
+        )
+        temp_path = os.path.join(self.temp_dir, f"{safe_name}.mp3")
+        try:
+            with open(temp_path, "wb") as f:
+                f.write(audio_content)
+            return temp_path
+        except Exception as e:
+            raise AudioProcessingError(f"Error saving temporary audio file: {str(e)}") from e
+    def _clean_temp_file(self, file_path: str):
+        """Remove ``file_path`` if it exists; failures are logged, never raised."""
+        try:
+            if os.path.exists(file_path):
+                os.remove(file_path)
+        except Exception as e:
+            logging.warning(f"Warning: Could not remove temporary file {file_path}: {str(e)}")
+    def _transcribe_with_whisper(self, audio_path: str, question: str = "") -> dict:
+        """Transcribe the file at ``audio_path`` and return the result as a plain dict.
+        Segment timestamps are requested only when ``question`` mentions time.
+        The SDK response object is converted to a dict so the rest of the
+        class can use ``in``, ``[...]`` and ``.get`` on it.
+        Raises:
+            AudioProcessingError: If the file cannot be read or the API call fails.
+        """
+        try:
+            # Determine if we need timestamps based on the question
+            timestamps_needed = any(word in question.lower()
+                                 for word in ["when", "time", "moment", "timestamp"])
+            request_kwargs = {
+                "model": "whisper-1",
+                "response_format": "verbose_json",
+                "language": "en"  # You might want to make this dynamic based on the content
+            }
+            # Only send the option when wanted instead of passing an explicit None
+            if timestamps_needed:
+                request_kwargs["timestamp_granularities"] = ["segment"]
+            with open(audio_path, "rb") as audio_file:
+                response = self.client.audio.transcriptions.create(
+                    file=audio_file,
+                    **request_kwargs
+                )
+            if isinstance(response, dict):
+                return response
+            if hasattr(response, "model_dump"):
+                return response.model_dump()
+            return dict(response)
+        except Exception as e:
+            raise AudioProcessingError(f"Error transcribing audio with Whisper: {str(e)}") from e
+    def _extract_relevant_info(self, transcription: dict, question: str) -> str:
+        """Build the document text from a transcription dict.
+        Always includes the full text when present. Adds per-segment
+        timestamps when the question mentions time, and a list of the
+        integers found in the text when the question mentions numbers,
+        lists or pages.
+        """
+        question_lower = question.lower()
+        full_text = transcription.get("text")
+        segments = transcription.get("segments") or []
+        content_parts = []
+        # Add full transcription
+        if full_text is not None:
+            content_parts.append(f"Full transcription: {full_text}")
+        # Add timestamps if present and relevant
+        if segments and any(word in question_lower
+                            for word in ["when", "time", "moment", "timestamp"]):
+            timestamps = "\n".join([
+                f"[{segment['start']:.2f}s - {segment['end']:.2f}s]: {segment['text']}"
+                for segment in segments
+            ])
+            content_parts.append(f"\nDetailed segments with timestamps:\n{timestamps}")
+        # If looking for specific numbers or lists
+        if any(word in question_lower for word in ["number", "list", "page", "pages"]):
+            numbers = re.findall(r'\b\d+\b', full_text or "")
+            if numbers:
+                content_parts.append(f"\nNumbers mentioned: {', '.join(numbers)}")
+        return "\n".join(content_parts)
+    def _run(self, task_id: str, question: str = "") -> List[Document]:
+        """Download, transcribe and package the task's audio file.
+        On any failure a single ``Document`` describing the error is returned
+        instead of raising. The temporary copy is always removed.
+        """
+        temp_path = None
+        try:
+            # Get audio content using base class method
+            audio_content = self._get_file_from_task_id(task_id, "audio")
+            # Save to temporary file
+            temp_path = self._save_temp_audio(audio_content, task_id)
+            # Transcribe with Whisper
+            transcription = self._transcribe_with_whisper(temp_path, question)
+            # Extract relevant information
+            processed_content = self._extract_relevant_info(transcription, question)
+            # Create document with metadata
+            return [Document(
+                page_content=processed_content,
+                metadata={
+                    "source": task_id,
+                    "type": "audio",
+                    "content_type": "whisper_transcription",
+                    "question_context": question,
+                    "language": transcription.get("language", "en"),
+                    "duration": transcription.get("duration", None)
+                }
+            )]
+        except Exception as e:
+            error_msg = f"Error processing audio: {str(e)}"
+            logging.error(error_msg)
+            return [Document(
+                page_content=error_msg,
+                metadata={"source": task_id, "type": "audio", "error": str(e)}
+            )]
+        finally:
+            # Clean up temporary file
+            if temp_path:
+                self._clean_temp_file(temp_path)
+    async def _arun(self, task_id: str, question: str = "") -> List[Document]:
+        """Async wrapper that simply delegates to the synchronous ``_run``."""
+        return self._run(task_id, question)
+class ExcelTool(BaseContentTool):
+    """Tool that downloads a task's Excel file and renders it as text."""
+    name: str = "excel_tool"
+    description: str = "Tool for processing Excel files and extracting their content"
+    def _process_excel_content(self, content: bytes) -> pd.DataFrame:
+        """Parse raw workbook bytes into a DataFrame using ``pd.read_excel`` defaults.
+        Raises:
+            ValueError: If pandas cannot read the content.
+        """
+        try:
+            return pd.read_excel(io.BytesIO(content))
+        except Exception as e:
+            logging.error(f"Error reading Excel content: {str(e)}")
+            raise ValueError(f"Failed to read Excel content: {str(e)}") from e
+    def _dataframe_to_text(self, df: pd.DataFrame) -> str:
+        """Render ``df`` as text: size, column names, a preview and numeric statistics.
+        The preview covers at most the first 50 rows. Statistics are added
+        only when the frame has ``int64`` or ``float64`` columns.
+        """
+        text_parts = []
+        # Basic DataFrame information
+        text_parts.append(f"Total Rows: {len(df)}")
+        text_parts.append(f"Total Columns: {len(df.columns)}")
+        text_parts.append("\nColumns:")
+        # Headers may be numbers or dates; str.join only accepts strings
+        text_parts.append(", ".join(str(column) for column in df.columns.tolist()))
+        # Data preview (first 50 rows)
+        preview_rows = min(50, len(df))
+        text_parts.append(f"\nData Preview (first {preview_rows} rows):")
+        text_parts.append(df.head(preview_rows).to_string())
+        # Summary statistics for numeric columns
+        numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
+        if len(numeric_cols) > 0:
+            text_parts.append("\nSummary Statistics for Numeric Columns:")
+            text_parts.append(df[numeric_cols].describe().to_string())
+        return "\n".join(text_parts)
+    def _run(self, task_id: str) -> List[Document]:
+        """Download the task's Excel file and return one document with its text rendering.
+        On any failure a single ``Document`` describing the error is returned
+        instead of raising.
+        """
+        try:
+            # Get file content using base class method
+            content = self._get_file_from_task_id(task_id, "excel")
+            # Process Excel content
+            df = self._process_excel_content(content)
+            # Convert DataFrame to text
+            text_content = self._dataframe_to_text(df)
+            # Create metadata
+            metadata = {
+                "source": task_id,
+                "content_type": "excel",
+                "row_count": len(df),
+                "column_count": len(df.columns),
+                "columns": df.columns.tolist()
+            }
+            # Create and return document
+            return [Document(
+                page_content=text_content,
+                metadata=metadata
+            )]
+        except Exception as e:
+            error_msg = f"Error processing Excel file: {str(e)}"
+            logging.error(error_msg)
+            return [Document(
+                page_content=error_msg,
+                metadata={"source": task_id, "content_type": "error"}
+            )]
+    async def _arun(self, task_id: str) -> List[Document]:
+        """Async wrapper that simply delegates to the synchronous ``_run``."""
+        return self._run(task_id)
+class WebContentTool(BaseContentTool):
+    """Tool that runs a DuckDuckGo search and chunks the text it returns."""
+    name: str = "web_content_processor"
+    description: str = "Search the web and process webpage content"
+    search_tool: DuckDuckGoSearchRun = Field(default_factory=DuckDuckGoSearchRun)
+    # Same splitter configuration as the base class, declared again here.
+    text_splitter: RecursiveCharacterTextSplitter = Field(default_factory=lambda: RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=200,
+        length_function=len,
+        separators=["\n\n", "\n", " ", ""]
+    ))
+    def _run(self, query: str) -> List[Document]:
+        """Search for ``query`` and return the result text as split documents.
+        Raises:
+            WebProcessingError: If the search returns nothing or any step fails.
+        """
+        try:
+            raw_results = self.search_tool.invoke(query)
+            if not raw_results:
+                raise WebProcessingError("No search results found")
+            result_metadata = {
+                "source": "duckduckgo",
+                "type": "web_content",
+                "query": query
+            }
+            chunks = self.text_splitter.create_documents(
+                [raw_results],
+                metadatas=[result_metadata]
+            )
+            logging.info(f"Successfully retrieved search results for query: {query[:100]}...")
+            return chunks
+        except Exception as e:
+            failure = f"Web search failed: {str(e)}"
+            logging.error(failure)
+            raise WebProcessingError(failure)
+    async def _arun(self, query: str) -> List[Document]:
+        """Async wrapper that simply delegates to the synchronous ``_run``."""
+        return self._run(query)
+class ChromaDBManager:
+    """Manager for ChromaDB operations"""
+    def __init__(self, persist_directory: str = "./chroma_db"):
+        """Remember ``persist_directory`` and open a persistent Chroma client on it."""
+        self.persist_directory = persist_directory
+        chroma_settings = Settings(
+            persist_directory=persist_directory,
+            is_persistent=True
+        )
+        self.client = chromadb.Client(chroma_settings)
+    def create_collection(self, name: str):
+        """Return the collection called ``name``, creating it first if it does not exist."""
+        # The exception raised for an existing name differs between Chroma
+        # releases, so catching ValueError is fragile; the client's own
+        # get-or-create call covers both cases in one step.
+        return self.client.get_or_create_collection(name=name)
+    def _generate_document_id(self, content: str, metadata: dict) -> str:
+        """Return a deterministic ID derived from a document's content and metadata.
+        The ID is ``doc_`` followed by the SHA-256 hex digest of the full
+        content joined by ``_`` with the metadata ``source`` and ``type``
+        values when they are present and non-empty. Built-in ``hash()`` is
+        avoided because it is salted per process for strings, which would
+        give the same document a different ID on every run of a persistent
+        store. The whole content is hashed so that chunks sharing their
+        first characters do not collide.
+        """
+        import hashlib  # local import keeps this method self-contained
+        id_parts = [content]
+        if metadata:
+            source = metadata.get('source', '')
+            doc_type = metadata.get('type', '')
+            if source:
+                id_parts.append(str(source))
+            if doc_type:
+                id_parts.append(str(doc_type))
+        # Generate hash from combined parts
+        combined = "_".join(id_parts)
+        digest = hashlib.sha256(combined.encode("utf-8", errors="replace")).hexdigest()
+        return f"doc_{digest}"
+    def add_documents_with_metadata(self, collection_name: str, documents: List[str], metadatas: List[dict]):
+        """Insert or update documents and their metadata in a collection.
+        Args:
+            collection_name: Collection to write to; created if missing.
+            documents: Document texts.
+            metadatas: One metadata dict per document, in the same order.
+        Raises:
+            ValueError: If either list is empty or their lengths differ.
+        """
+        if not documents or not metadatas or len(documents) != len(metadatas):
+            raise ValueError("Invalid documents or metadata")
+        collection = self.create_collection(collection_name)
+        # Key by generated ID so a batch never carries the same ID twice;
+        # when two entries share an ID the later one wins.
+        unique_entries = {}
+        for doc, meta in zip(documents, metadatas):
+            unique_entries[self._generate_document_id(doc, meta)] = (doc, meta)
+        # A single upsert covers both new and existing IDs, so a genuine
+        # failure is no longer swallowed and retried as an "update".
+        collection.upsert(
+            documents=[doc for doc, _ in unique_entries.values()],
+            metadatas=[meta for _, meta in unique_entries.values()],
+            ids=list(unique_entries)
+        )
+    def query_collection(self, collection_name: str, query: str, n_results: int = 5) -> Dict:
+        """Run a text query against a collection and return Chroma's result dict.
+        Any error is logged and an empty result (empty ``documents``,
+        ``metadatas`` and ``distances`` lists) is returned instead of raising.
+        """
+        try:
+            target = self.client.get_collection(collection_name)
+            hits = target.query(
+                query_texts=[query],
+                n_results=n_results
+            )
+            # Debug output: the query and how many documents came back
+            logging.debug(f"Query: {query}")
+            hit_count = len(hits['documents'][0]) if hits['documents'] else 0
+            logging.debug(f"Number of results: {hit_count}")
+            return hits
+        except Exception as e:
+            logging.error(f"Error querying collection {collection_name}: {str(e)}")
+            return {"documents": [], "metadatas": [], "distances": []}