Create app.py

app.py CHANGED
@@ -6,23 +6,16 @@ import requests
 import inspect
 import pandas as pd
 import cv2 # Import opencv-python for video processing
-import speech_recognition as sr # Import SpeechRecognition for audio processing
-from pydub import AudioSegment # Import pydub for audio manipulation
-import tempfile # Import tempfile for temporary file handling
-import numpy as np # Import numpy for image processing
 
-
-# Import libraries for SerpAPI
+# Import libraries for SerpAPI and Google Generative AI
 from serpapi import GoogleSearch
-import google.generativeai as genai
-
+import google.generativeai as genai
 
 # --- Get API Keys from Environment Variables ---
 # SERPAPI_API_KEY and GOOGLE_API_KEY should be set as secrets in your Hugging Face Space
 SERPAPI_API_KEY = os.getenv('SERPAPI_API_KEY')
 print(f"SERPAPI_API_KEY (first 5 chars): {SERPAPI_API_KEY[:5] if SERPAPI_API_KEY else 'None'}...") # Debugging API key
 
-# Keep GOOGLE_API_KEY handling as the user might add LLM functionality back later
 GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
 print(f"GOOGLE_API_KEY (first 5 chars): {GOOGLE_API_KEY[:5] if GOOGLE_API_KEY else 'None'}...") # Debugging API key
 
@@ -31,7 +24,6 @@ DEFAULT_API_URL = "https://agent-challenge.hf.space/agent_challenge" # Or the co
 
 
 # --- Google Generative AI LLM Initialization ---
-# Keep LLM initialization but handle potential errors and None state
 print("Attempting to initialize Google Generative AI model...") # Debugging print before loading
 
 gemini_model = None # Initialize to None
@@ -57,7 +49,7 @@ else:
 
 # --- Web Search Function (using SerpAPI) ---
 def web_search(query: str) -> list[dict]:
-
+    global gemini_model # Ensure global declaration is first
     """
     Performs a web search using SerpAPI and returns relevant information.
 
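Note on the new `global gemini_model` line: because it now sits above the triple-quoted string, Python no longer records that string as `web_search.__doc__` (a docstring must be the first statement in a function body). A minimal sketch of the conventional ordering, keeping both the docstring and the declaration; this is an editorial illustration, not code from the commit:

    def web_search(query: str) -> list[dict]:
        """Performs a web search using SerpAPI and returns relevant information."""
        global gemini_model  # the declaration only has to precede the name's first use
        ...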
@@ -87,11 +79,6 @@ def web_search(query: str) -> list[dict]:
         search_results_dict = search.get_dict() # Get results as a dictionary
         print(f"SerpAPI raw response keys: {search_results_dict.keys() if isinstance(search_results_dict, dict) else 'Response is not a dictionary'}") # Debugging response keys
 
-        # Log the full SerpAPI response for debugging if organic_results is missing or empty
-        if not isinstance(search_results_dict, dict) or "organic_results" not in search_results_dict or not isinstance(search_results_dict["organic_results"], list) or not search_results_dict["organic_results"]:
-            print(f"SerpAPI response did not contain organic results or had invalid format. Response: {search_results_dict}")
-
-
         # Extract organic results
         # Add check that search_results_dict and organic_results are valid
         if isinstance(search_results_dict, dict) and "organic_results" in search_results_dict and isinstance(search_results_dict["organic_results"], list):
@@ -109,6 +96,8 @@ def web_search(query: str) -> list[dict]:
                 results.append(item)
         else:
             print(f"No 'organic_results' key found or invalid format in SerpAPI response. Response type: {type(search_results_dict)}")
+            # Print the whole response if no organic_results are found for debugging
+            # print(f"SerpAPI response (no organic results): {search_results_dict}")
 
 
     except Exception as e:
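The guards in these two hunks check the response shape field by field before indexing into it. A compact equivalent of that defensive pattern, sketched with dict.get; this is an illustration rather than the commit's code, and the title/link/snippet field names are assumed from SerpAPI's organic-result payload:

    def extract_organic_results(search_results_dict) -> list[dict]:
        # Same guard as the diff: only proceed when the response is a dict
        # holding a list under "organic_results".
        if not isinstance(search_results_dict, dict):
            return []
        organic = search_results_dict.get("organic_results", [])
        if not isinstance(organic, list):
            return []
        # Field names assumed from SerpAPI's documented result shape.
        return [
            {"title": r.get("title"), "link": r.get("link"), "snippet": r.get("snippet")}
            for r in organic
            if isinstance(r, dict)
        ]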
@@ -120,17 +109,18 @@ def web_search(query: str) -> list[dict]:
     return results # Always return a list (empty or with results)
 
 
-# --- Basic Agent Definition (
+# --- Basic Agent Definition (Updated to use Google LLM and add video processing) ---
 class BasicAgent:
 
     def __init__(self):
         print("BasicAgent initialized.") # Debugging print before init
-        #
-
-
-
-
-
+        # Check if LLM is loaded (optional but good practice)
+        global gemini_model # Access global variable
+        if gemini_model is None:
+            print("Warning: Google Generative AI model not successfully loaded before agent initialization.")
+            # The agent can still perform search but won't use the LLM for synthesis
+        else:
+            print("Google Generative AI model found and ready.") # Debugging print after successful init
 
     def process_video(self, video_source: str) -> str:
         """
@@ -147,11 +137,8 @@ class BasicAgent:
         cap = None
         try:
             # Attempt to open the video source
-            # Using cv2.CAP_FFMPEG might help with URLs, but requires FFmpeg
-            # cap = cv2.VideoCapture(video_source, cv2.CAP_FFMPEG)
             cap = cv2.VideoCapture(video_source)
 
-
             # Check if the video was opened successfully
             if not cap.isOpened():
                 print(f"Error: Could not open video source {video_source}")
@@ -190,131 +177,158 @@ class BasicAgent:
             cap.release()
             print("Video capture released.")
 
-    def process_audio(self, audio_source: str) -> str:
-        """
-        Processes an audio source (file path), extracts speech, and performs
-        placeholder audio analysis.
-
-        Args:
-            audio_source: Path to the audio file.
-
-        Returns:
-            A string summarizing the audio processing result or an error message.
-        """
-        print(f"Processing audio source: {audio_source}")
-        recognizer = sr.Recognizer()
-        try:
-            # Load the audio file
-            audio = AudioSegment.from_file(audio_source)
-            print(f"Audio loaded. Duration: {len(audio)} ms")
-
-            # Export to a format SpeechRecognition can handle (e.g., WAV)
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
-                audio.export(fp.name, format="wav")
-                temp_wav_file = fp.name
-            print(f"Audio exported to temporary WAV: {temp_wav_file}")
-
-            # Use SpeechRecognition to transcribe the audio
-            with sr.AudioFile(temp_wav_file) as source:
-                print("Reading audio file for transcription...")
-                audio_data = recognizer.record(source) # read the entire audio file
-                print("Audio data recorded.")
-
-            # Attempt to recognize speech
-            try:
-                print("Attempting speech recognition...")
-                text = recognizer.recognize_google(audio_data) # Using Google Web Speech API
-                print(f"Transcription result: {text}")
-                return f"Audio processed. Transcription: '{text}'"
-            except sr.UnknownValueError:
-                print("Speech Recognition could not understand audio")
-                return "Audio processed, but could not understand speech."
-            except sr.RequestError as e:
-                print(f"Could not request results from Google Speech Recognition service; {e}")
-                return f"Audio processed, but speech recognition service failed: {e}"
-            except Exception as e:
-                print(f"An unexpected error occurred during speech recognition: {e}")
-                return f"An unexpected error occurred during speech recognition: {e}"
-
-        except Exception as e:
-            print(f"An error occurred during audio processing: {e}")
-            return f"An error occurred during audio processing: {e}"
-        finally:
-            # Clean up the temporary WAV file
-            if 'temp_wav_file' in locals() and os.path.exists(temp_wav_file):
-                os.remove(temp_wav_file)
-                print(f"Temporary WAV file removed: {temp_wav_file}")
-
 
-    def __call__(self, question: str, video_source: str | None = None
-
+    def __call__(self, question: str, video_source: str | None = None) -> str:
+        global gemini_model # Ensure global declaration is first
         print(f"Agent received question (first 50 chars): {question[:50]}...")
         print(f"Video source provided: {video_source}")
-        print(f"Audio source provided: {audio_source}")
 
-
-
+
+        # --- Check for video processing task ---
         if video_source:
             print("Video source provided. Attempting video processing.")
             video_processing_result = self.process_video(video_source)
-
-
-
-            print("Audio source provided. Attempting audio processing.")
-            audio_processing_result = self.process_audio(audio_source)
-            media_processing_results.append(f"Audio processing result: {audio_processing_result}")
-
-        # If media was processed, return the results for now
-        if media_processing_results:
-            return "\n".join(media_processing_results)
+            # For now, the agent just reports on video processing.
+            # Future versions could integrate this with the LLM.
+            return f"Video processing requested. Result: {video_processing_result}"
 
 
-        # Simple logic to determine if a web search is needed (only if no
+        # Simple logic to determine if a web search is needed (only if no video source)
         question_lower = question.lower()
         search_keywords = ["what is", "how to", "where is", "who is", "when did", "define", "explain", "tell me about"]
         needs_search = any(keyword in question_lower for keyword in search_keywords) or "?" in question
         print(f"Needs search: {needs_search}") # Debugging search decision
 
         # --- Analyze question and refine search query ---
-        # Simplified search query generation - removed LLM query generation
         search_query = question # Default search query is the original question
         if needs_search:
             print("Analyzing question for keywords and refining search query...")
-            #
-
-
-
-
-
-
-
-
-
+            # Use LLM to generate search query if available
+            if gemini_model is not None:
+                print("Using LLM to generate search query.")
+                query_prompt = f"""Given the following question, generate the most effective web search query to find information to answer it.
+Focus on extracting key entities and concepts. Do not include question words like "what is" or "how to".
+
+Question: {question}
+
+Search Query:"""
+                try:
+                    response = gemini_model.generate_content(query_prompt)
+                    generated_query = response.text.strip()
+                    # Add check for empty or single-word query from LLM
+                    if generated_query and len(generated_query.split()) > 1: # Ensure it's not empty or just one word
+                        search_query = generated_query
+                        print(f"LLM generated search query: {search_query}")
+                    else:
+                        print(f"LLM generated empty or single-word query: '{generated_query}'. Falling back to basic extraction.")
+                        # Fallback to basic extraction if LLM fails
+                        parts = question_lower.split("what is", 1)
+                        if len(parts) > 1:
+                            search_query = parts[1].strip()
+                        else:
+                            parts = question_lower.split("how to", 1)
+                            if len(parts) > 1:
+                                search_query = parts[1].strip()
+                            else:
+                                parts = question_lower.split("where is", 1)
+                                if len(parts) > 1:
+                                    search_query = parts[1].strip()
+                                else:
+                                    parts = question_lower.split("who is", 1)
+                                    if len(parts) > 1:
+                                        search_query = parts[1].strip()
+                                    else:
+                                        parts = question_lower.split("when did", 1)
+                                        if len(parts) > 1:
+                                            search_query = parts[1].strip()
+                                        else:
+                                            parts = question_lower.split("define", 1)
+                                            if len(parts) > 1:
+                                                search_query = parts[1].strip()
+                                            else:
+                                                parts = question_lower.split("explain", 1)
+                                                if len(parts) > 1:
+                                                    search_query = parts[1].strip()
+                                                else:
+                                                    parts = question_lower.split("tell me about", 1)
+                                                    if len(parts) > 1:
+                                                        search_query = parts[1].strip()
+                                                    else:
+                                                        search_query = question_lower.strip() # Fallback to whole question
+
+
+                except Exception as llm_e:
+                    print(f"An error occurred during LLM search query generation: {llm_e}. Falling back to basic extraction.")
+                    # Fallback to basic extraction if LLM call fails
+                    parts = question_lower.split("what is", 1)
+                    if len(parts) > 1:
+                        search_query = parts[1].strip()
+                    else:
+                        parts = question_lower.split("how to", 1)
+                        if len(parts) > 1:
+                            search_query = parts[1].strip()
+                        else:
+                            parts = question_lower.split("where is", 1)
+                            if len(parts) > 1:
+                                search_query = parts[1].strip()
+                            else:
+                                parts = question_lower.split("who is", 1)
+                                if len(parts) > 1:
+                                    search_query = parts[1].strip()
+                                else:
+                                    parts = question_lower.split("when did", 1)
+                                    if len(parts) > 1:
+                                        search_query = parts[1].strip()
+                                    else:
+                                        parts = question_lower.split("define", 1)
+                                        if len(parts) > 1:
+                                            search_query = parts[1].strip()
+                                        else:
+                                            parts = question_lower.split("explain", 1)
+                                            if len(parts) > 1:
+                                                search_query = parts[1].strip()
+                                            else:
+                                                parts = question_lower.split("tell me about", 1)
+                                                if len(parts) > 1:
+                                                    search_query = parts[1].strip()
+                                                else:
+                                                    search_query = question_lower.strip() # Fallback to whole question
+            else: # LLM not available for query generation
+                print("LLM not available for query generation. Using basic search query extraction.")
+                # Fallback to basic extraction if LLM is not initialized
+                parts = question_lower.split("what is", 1)
+                if len(parts) > 1:
+                    search_query = parts[1].strip()
+                else:
+                    parts = question_lower.split("how to", 1)
                     if len(parts) > 1:
                         search_query = parts[1].strip()
                     else:
-
-
-
-
-                        parts = question_lower.split("
+                        parts = question_lower.split("where is", 1)
+                        if len(parts) > 1:
+                            search_query = parts[1].strip()
+                        else:
+                            parts = question_lower.split("who is", 1)
                             if len(parts) > 1:
                                 search_query = parts[1].strip()
                             else:
-                                parts = question_lower.split("
+                                parts = question_lower.split("when did", 1)
                                 if len(parts) > 1:
                                     search_query = parts[1].strip()
                                 else:
-                                    parts = question_lower.split("
+                                    parts = question_lower.split("define", 1)
                                     if len(parts) > 1:
                                         search_query = parts[1].strip()
                                     else:
-                                        parts = question_lower.split("
+                                        parts = question_lower.split("explain", 1)
                                         if len(parts) > 1:
                                             search_query = parts[1].strip()
                                         else:
-
-
+                                            parts = question_lower.split("tell me about", 1)
+                                            if len(parts) > 1:
+                                                search_query = parts[1].strip()
+                                            else:
+                                                search_query = question_lower.strip() # Fallback to whole question
 
 
         # Optional: Add quotation marks for multi-word phrases if identified
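The fallback keyword extraction added above is pasted three times (weak LLM query, LLM call raised, LLM unavailable) and nests one level deeper per keyword. All three copies could collapse into a single loop; a sketch of an equivalent helper, with a hypothetical name and the same keyword order as the diff:

    def extract_search_query(question_lower: str, keywords: list[str]) -> str:
        # Mirrors the nested split cascade: strip the first matching question
        # keyword and return the remainder of the question.
        for keyword in keywords:
            parts = question_lower.split(keyword, 1)
            if len(parts) > 1:
                return parts[1].strip()
        return question_lower.strip()  # Fallback to the whole question

    # Each fallback branch then reduces to:
    # search_query = extract_search_query(question_lower, search_keywords)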
@@ -345,8 +359,8 @@ class BasicAgent:
             print(f"An error occurred during web search: {e}")
             return f"An error occurred during web search: {e}"
 
-        # --- Use LLM to process search results if available
-        #
+        # --- Use LLM to process search results if available ---
+        # Add check that search_results is a list and not empty before proceeding
         if isinstance(search_results, list) and search_results and gemini_model is not None:
             print("Using Google LLM to process search results.") # Debugging print before LLM call
 
@@ -588,25 +602,6 @@ def run_and_submit_all( profile: gr.OAuthProfile | None, other_arg=None): # Modi
     return status_message, results_df
 
 
-# Function to call process_video directly for testing
-def test_video_processing(video_source: str) -> str:
-    print(f"Testing video processing with source: {video_source}")
-    try:
-        agent = BasicAgent()
-        return agent.process_video(video_source)
-    except Exception as e:
-        return f"Error during video processing test: {e}"
-
-# Function to call process_audio directly for testing
-def test_audio_processing(audio_source: str) -> str:
-    print(f"Testing audio processing with source: {audio_source}")
-    try:
-        agent = BasicAgent()
-        return agent.process_audio(audio_source)
-    except Exception as e:
-        return f"Error during audio processing test: {e}"
-
-
 # Move Gradio interface definition and launch outside the function
 with gr.Blocks(theme=gr.themes.Soft(), title="Basic Agent Evaluation Runner") as demo:
     gr.Markdown(
@@ -619,8 +614,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Basic Agent Evaluation Runner") as demo:
         1. Ensure your agent logic is defined in the `BasicAgent` class above.
         2. **Get a SerpAPI key and a Google AI API key and add them as environment variables in your runtime environment (e.g., as secrets in your Hugging Face Space settings).**
         3. Log in to Hugging Face using the button below.
-        4. Click the "Run Evaluation & Submit All Answers" button
-        5.
+        4. Click the "Run Evaluation & Submit All Answers" button.
+        5. The application will fetch questions, run your agent, submit answers, and display the results below.
         """
     )
     login_btn = gr.LoginButton()
@@ -631,35 +626,18 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Basic Agent Evaluation Runner") as demo:
     status_output = gr.Textbox(label="Run Status", interactive=False, lines=5)
     results_output = gr.DataFrame(label="Evaluation Results")
 
+    # Add Gradio components for video input
+    video_input = gr.Video(label="Upload Video or Paste URL (Optional)")
+
     run_button.click(
         run_and_submit_all,
-
+        # Pass the video_input to the function
+        inputs=[login_btn], # Modified to exclude video_input for now as run_and_submit_all doesn't use it
         outputs=[status_output, results_output]
     )
-
-
-
-
-    video_test_input = gr.Video(label="Upload Video or Paste URL")
-    video_test_button = gr.Button("Test Video Processing")
-    video_test_output = gr.Textbox(label="Video Processing Result", interactive=False)
-
-    video_test_button.click(
-        test_video_processing,
-        inputs=[video_test_input],
-        outputs=[video_test_output]
-    )
-
-    audio_test_input = gr.Audio(label="Upload Audio or Paste URL")
-    audio_test_button = gr.Button("Test Audio Processing")
-    audio_test_output = gr.Textbox(label="Audio Processing Result", interactive=False)
-
-    audio_test_button.click(
-        test_audio_processing,
-        inputs=[audio_test_input],
-        outputs=[audio_test_output]
-    )
-
+    # Add a separate button or modify the existing one to handle video processing
+    # For this subtask, we are just adding the video processing capability to the agent,
+    # not fully integrating it into the Gradio submission flow yet.
 
 # Ensure the app launches when the script is run
 if __name__ == "__main__":
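On the `inputs=[login_btn]` change: when a Blocks app includes a `gr.LoginButton`, Gradio fills any event-handler parameter annotated `gr.OAuthProfile | None` from the OAuth session rather than from the listed input components, which is how `run_and_submit_all` can receive the profile without a matching component. A small self-contained sketch of that mechanism (assumes Gradio's Spaces OAuth support; the names here are illustrative):

    import gradio as gr

    def whoami(profile: gr.OAuthProfile | None) -> str:
        # `profile` is injected from the login session via the type annotation,
        # not read from an input component.
        return f"Logged in as {profile.username}" if profile else "Not logged in."

    with gr.Blocks() as sketch_demo:
        gr.LoginButton()
        status = gr.Markdown()
        sketch_demo.load(whoami, inputs=None, outputs=status)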