Final_Assignment_Template1

Build error

App Files Files Community

Carolzinha2010 committed on Jul 16, 2025

Commit

bf4c445

verified ·

1 Parent(s): 21d92ed

Create app.py

Browse files

Files changed (1) hide show

app.py +169 -145

app.py CHANGED Viewed

@@ -6,24 +6,32 @@ import requests
 import inspect
 import pandas as pd
 import cv2 # Import opencv-python for video processing
-# Import libraries for SerpAPI and Google Generative AI
 from serpapi import GoogleSearch
-import google.generativeai as genai
 # --- Get API Keys from Environment Variables ---
 # SERPAPI_API_KEY and GOOGLE_API_KEY should be set as secrets in your Hugging Face Space
 SERPAPI_API_KEY = os.getenv('SERPAPI_API_KEY')
 print(f"SERPAPI_API_KEY (first 5 chars): {SERPAPI_API_KEY[:5] if SERPAPI_API_KEY else 'None'}...") # Debugging API key
 GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
 print(f"GOOGLE_API_KEY (first 5 chars): {GOOGLE_API_KEY[:5] if GOOGLE_API_KEY else 'None'}...") # Debugging API key
 # --- Define the default API URL ---
-DEFAULT_API_URL = "https://agent-challenge.hf.space/agent_challenge" # Or the correct API URL if different
 # --- Google Generative AI LLM Initialization ---
 print("Attempting to initialize Google Generative AI model...") # Debugging print before loading
 gemini_model = None # Initialize to None
@@ -49,7 +57,7 @@ else:
 # --- Web Search Function (using SerpAPI) ---
 def web_search(query: str) -> list[dict]:
-    global gemini_model # Ensure global declaration is first
     """
     Performs a web search using SerpAPI and returns relevant information.
@@ -79,6 +87,11 @@ def web_search(query: str) -> list[dict]:
         search_results_dict = search.get_dict() # Get results as a dictionary
         print(f"SerpAPI raw response keys: {search_results_dict.keys() if isinstance(search_results_dict, dict) else 'Response is not a dictionary'}") # Debugging response keys
         # Extract organic results
         # Add check that search_results_dict and organic_results are valid
         if isinstance(search_results_dict, dict) and "organic_results" in search_results_dict and isinstance(search_results_dict["organic_results"], list):
@@ -109,18 +122,17 @@ def web_search(query: str) -> list[dict]:
     return results # Always return a list (empty or with results)
-# --- Basic Agent Definition (Updated to use Google LLM and add video processing) ---
 class BasicAgent:
     def __init__(self):
         print("BasicAgent initialized.") # Debugging print before init
-        # Check if LLM is loaded (optional but good practice)
-        global gemini_model # Access global variable
-        if gemini_model is None:
-             print("Warning: Google Generative AI model not successfully loaded before agent initialization.")
-             # The agent can still perform search but won't use the LLM for synthesis
-        else:
-            print("Google Generative AI model found and ready.") # Debugging print after successful init
     def process_video(self, video_source: str) -> str:
         """
@@ -137,8 +149,11 @@ class BasicAgent:
         cap = None
         try:
             # Attempt to open the video source
             cap = cv2.VideoCapture(video_source)
             # Check if the video was opened successfully
             if not cap.isOpened():
                 print(f"Error: Could not open video source {video_source}")
@@ -177,158 +192,131 @@ class BasicAgent:
                 cap.release()
                 print("Video capture released.")
-    def __call__(self, question: str, video_source: str | None = None) -> str:
-        global gemini_model # Ensure global declaration is first
         print(f"Agent received question (first 50 chars): {question[:50]}...")
         print(f"Video source provided: {video_source}")
-        # --- Check for video processing task ---
         if video_source:
             print("Video source provided. Attempting video processing.")
             video_processing_result = self.process_video(video_source)
-            # For now, the agent just reports on video processing.
-            # Future versions could integrate this with the LLM.
-            return f"Video processing requested. Result: {video_processing_result}"
-        # Simple logic to determine if a web search is needed (only if no video source)
         question_lower = question.lower()
         search_keywords = ["what is", "how to", "where is", "who is", "when did", "define", "explain", "tell me about"]
         needs_search = any(keyword in question_lower for keyword in search_keywords) or "?" in question
         print(f"Needs search: {needs_search}") # Debugging search decision
         # --- Analyze question and refine search query ---
         search_query = question # Default search query is the original question
         if needs_search:
             print("Analyzing question for keywords and refining search query...")
-            # Use LLM to generate search query if available
-            if gemini_model is not None:
-                print("Using LLM to generate search query.")
-                query_prompt = f"""Given the following question, generate the most effective web search query to find information to answer it.
-Focus on extracting key entities and concepts. Do not include question words like "what is" or "how to".
-Question: {question}
-Search Query:"""
-                try:
-                    response = gemini_model.generate_content(query_prompt)
-                    generated_query = response.text.strip()
-                    # Add check for empty or single-word query from LLM
-                    if generated_query and len(generated_query.split()) > 1: # Ensure it's not empty or just one word
-                         search_query = generated_query
-                         print(f"LLM generated search query: {search_query}")
-                    else:
-                        print(f"LLM generated empty or single-word query: '{generated_query}'. Falling back to basic extraction.")
-                        # Fallback to basic extraction if LLM fails
-                        parts = question_lower.split("what is", 1)
-                        if len(parts) > 1:
-                            search_query = parts[1].strip()
-                        else:
-                            parts = question_lower.split("how to", 1)
-                            if len(parts) > 1:
-                                search_query = parts[1].strip()
-                            else:
-                                 parts = question_lower.split("where is", 1)
-                                 if len(parts) > 1:
-                                     search_query = parts[1].strip()
-                                 else:
-                                     parts = question_lower.split("who is", 1)
-                                     if len(parts) > 1:
-                                         search_query = parts[1].strip()
-                                     else:
-                                          parts = question_lower.split("when did", 1)
-                                          if len(parts) > 1:
-                                              search_query = parts[1].strip()
-                                          else:
-                                               parts = question_lower.split("define", 1)
-                                               if len(parts) > 1:
-                                                   search_query = parts[1].strip()
-                                               else:
-                                                    parts = question_lower.split("explain", 1)
-                                                    if len(parts) > 1:
-                                                        search_query = parts[1].strip()
-                                                    else:
-                                                         parts = question_lower.split("tell me about", 1)
-                                                         if len(parts) > 1:
-                                                             search_query = parts[1].strip()
-                                                         else:
-                                                             search_query = question_lower.strip() # Fallback to whole question
-                except Exception as llm_e:
-                    print(f"An error occurred during LLM search query generation: {llm_e}. Falling back to basic extraction.")
-                    # Fallback to basic extraction if LLM call fails
-                    parts = question_lower.split("what is", 1)
-                    if len(parts) > 1:
-                        search_query = parts[1].strip()
-                    else:
-                        parts = question_lower.split("how to", 1)
-                        if len(parts) > 1:
-                            search_query = parts[1].strip()
-                        else:
-                             parts = question_lower.split("where is", 1)
-                             if len(parts) > 1:
-                                 search_query = parts[1].strip()
-                             else:
-                                 parts = question_lower.split("who is", 1)
-                                 if len(parts) > 1:
-                                     search_query = parts[1].strip()
-                                 else:
-                                      parts = question_lower.split("when did", 1)
-                                      if len(parts) > 1:
-                                          search_query = parts[1].strip()
-                                      else:
-                                           parts = question_lower.split("define", 1)
-                                           if len(parts) > 1:
-                                               search_query = parts[1].strip()
-                                           else:
-                                                parts = question_lower.split("explain", 1)
-                                                if len(parts) > 1:
-                                                    search_query = parts[1].strip()
-                                                else:
-                                                     parts = question_lower.split("tell me about", 1)
-                                                     if len(parts) > 1:
-                                                         search_query = parts[1].strip()
-                                                     else:
-                                                         search_query = question_lower.strip() # Fallback to whole question
-            else: # LLM not available for query generation
-                 print("LLM not available for query generation. Using basic search query extraction.")
-                 # Fallback to basic extraction if LLM is not initialized
-                 parts = question_lower.split("what is", 1)
-                 if len(parts) > 1:
-                     search_query = parts[1].strip()
-                 else:
-                     parts = question_lower.split("how to", 1)
                      if len(parts) > 1:
                          search_query = parts[1].strip()
                      else:
-                          parts = question_lower.split("where is", 1)
-                          if len(parts) > 1:
-                              search_query = parts[1].strip()
-                          else:
-                              parts = question_lower.split("who is", 1)
                               if len(parts) > 1:
                                   search_query = parts[1].strip()
                               else:
-                                   parts = question_lower.split("when did", 1)
                                    if len(parts) > 1:
                                        search_query = parts[1].strip()
                                    else:
-                                        parts = question_lower.split("define", 1)
                                         if len(parts) > 1:
                                             search_query = parts[1].strip()
                                         else:
-                                             parts = question_lower.split("explain", 1)
                                              if len(parts) > 1:
                                                  search_query = parts[1].strip()
                                              else:
-                                                  parts = question_lower.split("tell me about", 1)
-                                                  if len(parts) > 1:
-                                                      search_query = parts[1].strip()
-                                                  else:
-                                                      search_query = question_lower.strip() # Fallback to whole question
             # Optional: Add quotation marks for multi-word phrases if identified
@@ -359,8 +347,8 @@ Search Query:"""
                     print(f"An error occurred during web search: {e}")
                     return f"An error occurred during web search: {e}"
-                # --- Use LLM to process search results if available ---
-                # Add check that search_results is a list and not empty before proceeding
                 if isinstance(search_results, list) and search_results and gemini_model is not None:
                     print("Using Google LLM to process search results.") # Debugging print before LLM call
@@ -473,8 +461,8 @@ def run_and_submit_all( profile: gr.OAuthProfile | None, other_arg=None): # Modi
         return "Please Login to Hugging Face with the button.", None
     api_url = DEFAULT_API_URL
-    questions_url = f"{api_url}/questions"
-    submit_url = f"{api_url}/submit"
     # 1. Instantiate Agent ( modify this part to create your agent)
     print("Attempting to instantiate BasicAgent...") # Debugging print before instantiation
@@ -602,6 +590,25 @@ def run_and_submit_all( profile: gr.OAuthProfile | None, other_arg=None): # Modi
         return status_message, results_df
 # Move Gradio interface definition and launch outside the function
 with gr.Blocks(theme=gr.themes.Soft(), title="Basic Agent Evaluation Runner") as demo:
     gr.Markdown(
@@ -614,8 +621,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Basic Agent Evaluation Runner") as
         1. Ensure your agent logic is defined in the `BasicAgent` class above.
         2. **Get a SerpAPI key and a Google AI API key and add them as environment variables in your runtime environment (e.g., as secrets in your Hugging Face Space settings).**
         3. Log in to Hugging Face using the button below.
-        4. Click the "Run Evaluation & Submit All Answers" button.
-        5. The application will fetch questions, run your agent, submit answers, and display the results below.
         """
     )
     login_btn = gr.LoginButton()
@@ -626,18 +633,35 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Basic Agent Evaluation Runner") as
     status_output = gr.Textbox(label="Run Status", interactive=False, lines=5)
     results_output = gr.DataFrame(label="Evaluation Results")
-    # Add Gradio components for video input
-    video_input = gr.Video(label="Upload Video or Paste URL (Optional)")
     run_button.click(
         run_and_submit_all,
-        # Pass the video_input to the function
-        inputs=[login_btn], # Modified to exclude video_input for now as run_and_submit_all doesn't use it
         outputs=[status_output, results_output]
     )
-    # Add a separate button or modify the existing one to handle video processing
-    # For this subtask, we are just adding the video processing capability to the agent,
-    # not fully integrating it into the Gradio submission flow yet.
 # Ensure the app launches when the script is run
 if __name__ == "__main__":

 import inspect
 import pandas as pd
 import cv2 # Import opencv-python for video processing
+import speech_recognition as sr # Import SpeechRecognition for audio processing
+from pydub import AudioSegment # Import pydub for audio manipulation
+import tempfile # Import tempfile for temporary file handling
+import numpy as np # Import numpy for image processing
+# Import libraries for SerpAPI
 from serpapi import GoogleSearch
+import google.generativeai as genai # Keep the import as the user might add LLM functionality back later
 # --- Get API Keys from Environment Variables ---
 # SERPAPI_API_KEY and GOOGLE_API_KEY should be set as secrets in your Hugging Face Space
 SERPAPI_API_KEY = os.getenv('SERPAPI_API_KEY')
 print(f"SERPAPI_API_KEY (first 5 chars): {SERPAPI_API_KEY[:5] if SERPAPI_API_KEY else 'None'}...") # Debugging API key
+# Keep GOOGLE_API_KEY handling as the user might add LLM functionality back later
 GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
 print(f"GOOGLE_API_KEY (first 5 chars): {GOOGLE_API_KEY[:5] if GOOGLE_API_KEY else 'None'}...") # Debugging API key
 # --- Define the default API URL ---
+DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space" # Updated API URL
 # --- Google Generative AI LLM Initialization ---
+# Keep LLM initialization but handle potential errors and None state
 print("Attempting to initialize Google Generative AI model...") # Debugging print before loading
 gemini_model = None # Initialize to None
 # --- Web Search Function (using SerpAPI) ---
 def web_search(query: str) -> list[dict]:
+    # Removed global gemini_model declaration as it's not used here
     """
     Performs a web search using SerpAPI and returns relevant information.
         search_results_dict = search.get_dict() # Get results as a dictionary
         print(f"SerpAPI raw response keys: {search_results_dict.keys() if isinstance(search_results_dict, dict) else 'Response is not a dictionary'}") # Debugging response keys
+        # Log the full SerpAPI response for debugging if organic_results is missing or empty
+        if not isinstance(search_results_dict, dict) or "organic_results" not in search_results_dict or not isinstance(search_results_dict["organic_results"], list) or not search_results_dict["organic_results"]:
+             print(f"SerpAPI response did not contain organic results or had invalid format. Response: {search_results_dict}")
         # Extract organic results
         # Add check that search_results_dict and organic_results are valid
         if isinstance(search_results_dict, dict) and "organic_results" in search_results_dict and isinstance(search_results_dict["organic_results"], list):
     return results # Always return a list (empty or with results)
+# --- Basic Agent Definition (Modified to remove LLM dependency for now) ---
 class BasicAgent:
     def __init__(self):
+        """Creates the agent; only logs that initialization happened and sets no attributes."""
         print("BasicAgent initialized.") # Debugging print before init
+        # Removed LLM check as it's not used here
+        # global gemini_model # Access global variable
+        # if gemini_model is None:
+        #      print("Warning: Google Generative AI model not successfully loaded before agent initialization.")
+        # else:
+        #     print("Google Generative AI model found and ready.") # Debugging print after successful init
     def process_video(self, video_source: str) -> str:
         """
         cap = None
         try:
             # Attempt to open the video source
+            # Using cv2.CAP_FFMPEG might help with URLs, but requires FFmpeg
+            # cap = cv2.VideoCapture(video_source, cv2.CAP_FFMPEG)
             cap = cv2.VideoCapture(video_source)
             # Check if the video was opened successfully
             if not cap.isOpened():
                 print(f"Error: Could not open video source {video_source}")
                 cap.release()
                 print("Video capture released.")
+    def process_audio(self, audio_source: str) -> str:
+        """
+        Transcribes the speech in an audio file and reports the result as text.
+        The source is loaded with pydub, exported to a temporary WAV file, read
+        back through SpeechRecognition and passed to recognize_google (Google
+        Web Speech API). Failures are caught and reported in the returned
+        string rather than raised.
+        Args:
+            audio_source: Path to the audio file.
+        Returns:
+            A string summarizing the audio processing result or an error message.
+        """
+        print(f"Processing audio source: {audio_source}")
+        recognizer = sr.Recognizer()
+        # Bound before the try block so the finally clause can always consult it.
+        temp_wav_file = None
+        try:
+            # Load the audio file
+            audio = AudioSegment.from_file(audio_source)
+            print(f"Audio loaded. Duration: {len(audio)} ms")
+            # Reserve a temporary .wav path. The name is recorded immediately so the
+            # file is still cleaned up if the export below fails, and the handle is
+            # closed before pydub writes to that path.
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+                temp_wav_file = fp.name
+            # Export to a format SpeechRecognition can handle (WAV)
+            audio.export(temp_wav_file, format="wav")
+            print(f"Audio exported to temporary WAV: {temp_wav_file}")
+            # Use SpeechRecognition to transcribe the audio
+            with sr.AudioFile(temp_wav_file) as source:
+                print("Reading audio file for transcription...")
+                audio_data = recognizer.record(source) # read the entire audio file
+                print("Audio data recorded.")
+            # Attempt to recognize speech
+            try:
+                print("Attempting speech recognition...")
+                text = recognizer.recognize_google(audio_data) # Using Google Web Speech API
+                print(f"Transcription result: {text}")
+                return f"Audio processed. Transcription: '{text}'"
+            except sr.UnknownValueError:
+                print("Speech Recognition could not understand audio")
+                return "Audio processed, but could not understand speech."
+            except sr.RequestError as e:
+                print(f"Could not request results from Google Speech Recognition service; {e}")
+                return f"Audio processed, but speech recognition service failed: {e}"
+            except Exception as e:
+                print(f"An unexpected error occurred during speech recognition: {e}")
+                return f"An unexpected error occurred during speech recognition: {e}"
+        except Exception as e:
+            print(f"An error occurred during audio processing: {e}")
+            return f"An error occurred during audio processing: {e}"
+        finally:
+            # Clean up the temporary WAV file. A failed cleanup is only logged so it
+            # cannot replace the result that is being returned.
+            if temp_wav_file is not None and os.path.exists(temp_wav_file):
+                try:
+                    os.remove(temp_wav_file)
+                    print(f"Temporary WAV file removed: {temp_wav_file}")
+                except OSError as e:
+                    print(f"Could not remove temporary WAV file {temp_wav_file}: {e}")
+    def __call__(self, question: str, video_source: str | None = None, audio_source: str | None = None) -> str:
+        # Removed global gemini_model declaration as it's not used here
         print(f"Agent received question (first 50 chars): {question[:50]}...")
         print(f"Video source provided: {video_source}")
+        print(f"Audio source provided: {audio_source}")
+        # --- Check for media processing tasks ---
+        media_processing_results = []
         if video_source:
             print("Video source provided. Attempting video processing.")
             video_processing_result = self.process_video(video_source)
+            media_processing_results.append(f"Video processing result: {video_processing_result}")
+        if audio_source:
+            print("Audio source provided. Attempting audio processing.")
+            audio_processing_result = self.process_audio(audio_source)
+            media_processing_results.append(f"Audio processing result: {audio_processing_result}")
+        # If media was processed, return the results for now
+        if media_processing_results:
+            return "\n".join(media_processing_results)
+        # Simple logic to determine if a web search is needed (only if no media source)
         question_lower = question.lower()
         search_keywords = ["what is", "how to", "where is", "who is", "when did", "define", "explain", "tell me about"]
         needs_search = any(keyword in question_lower for keyword in search_keywords) or "?" in question
         print(f"Needs search: {needs_search}") # Debugging search decision
         # --- Analyze question and refine search query ---
+        # Simplified search query generation - removed LLM query generation
         search_query = question # Default search query is the original question
         if needs_search:
             print("Analyzing question for keywords and refining search query...")
+            # Basic keyword extraction: split by common question words and take the rest
+            parts = question_lower.split("what is", 1)
+            if len(parts) > 1:
+                search_query = parts[1].strip()
+            else:
+                parts = question_lower.split("how to", 1)
+                if len(parts) > 1:
+                    search_query = parts[1].strip()
+                else:
+                     parts = question_lower.split("where is", 1)
                      if len(parts) > 1:
                          search_query = parts[1].strip()
                      else:
+                         parts = question_lower.split("who is", 1)
+                         if len(parts) > 1:
+                             search_query = parts[1].strip()
+                         else:
+                              parts = question_lower.split("when did", 1)
                               if len(parts) > 1:
                                   search_query = parts[1].strip()
                               else:
+                                   parts = question_lower.split("define", 1)
                                    if len(parts) > 1:
                                        search_query = parts[1].strip()
                                    else:
+                                        parts = question_lower.split("explain", 1)
                                         if len(parts) > 1:
                                             search_query = parts[1].strip()
                                         else:
+                                             parts = question_lower.split("tell me about", 1)
                                              if len(parts) > 1:
                                                  search_query = parts[1].strip()
                                              else:
+                                                 # If no specific question keyword found, use the whole question
+                                                 search_query = question_lower.strip()
             # Optional: Add quotation marks for multi-word phrases if identified
                     print(f"An error occurred during web search: {e}")
                     return f"An error occurred during web search: {e}"
+                # --- Use LLM to process search results if available (Removed LLM Synthesis) ---
+                # Check that search_results is a list and is not empty
                 if isinstance(search_results, list) and search_results and gemini_model is not None:
                     print("Using Google LLM to process search results.") # Debugging print before LLM call
         return "Please Login to Hugging Face with the button.", None
     api_url = DEFAULT_API_URL
+    questions_url = f"{api_url}/agent_challenge/questions" # Corrected endpoint
+    submit_url = f"{api_url}/agent_challenge/submit" # Corrected endpoint
     # 1. Instantiate Agent ( modify this part to create your agent)
     print("Attempting to instantiate BasicAgent...") # Debugging print before instantiation
         return status_message, results_df
+# Function to call process_video directly for testing
+def test_video_processing(video_source: str) -> str:
+    """Runs process_video on a fresh BasicAgent and returns its result, or an error string if the call raises."""
+    print(f"Testing video processing with source: {video_source}")
+    try:
+        outcome = BasicAgent().process_video(video_source)
+    except Exception as exc:
+        return f"Error during video processing test: {exc}"
+    return outcome
+# Function to call process_audio directly for testing
+def test_audio_processing(audio_source: str) -> str:
+    """Runs process_audio on a fresh BasicAgent and returns its result, or an error string if the call raises."""
+    print(f"Testing audio processing with source: {audio_source}")
+    try:
+        outcome = BasicAgent().process_audio(audio_source)
+    except Exception as exc:
+        return f"Error during audio processing test: {exc}"
+    return outcome
 # Move Gradio interface definition and launch outside the function
 with gr.Blocks(theme=gr.themes.Soft(), title="Basic Agent Evaluation Runner") as demo:
     gr.Markdown(
         1. Ensure your agent logic is defined in the `BasicAgent` class above.
         2. **Get a SerpAPI key and a Google AI API key and add them as environment variables in your runtime environment (e.g., as secrets in your Hugging Face Space settings).**
         3. Log in to Hugging Face using the button below.
+        4. Click the "Run Evaluation & Submit All Answers" button to run on predefined questions.
+        5. Use the "Test Video Processing" and "Test Audio Processing" sections to test media analysis.
         """
     )
     login_btn = gr.LoginButton()
     status_output = gr.Textbox(label="Run Status", interactive=False, lines=5)
     results_output = gr.DataFrame(label="Evaluation Results")
     run_button.click(
         run_and_submit_all,
+        inputs=[login_btn], # Pass the profile from the login button
         outputs=[status_output, results_output]
     )
+    gr.Markdown("---") # Separator
+    gr.Markdown("## Test Media Processing")
+    video_test_input = gr.Video(label="Upload Video or Paste URL")
+    video_test_button = gr.Button("Test Video Processing")
+    video_test_output = gr.Textbox(label="Video Processing Result", interactive=False)
+    video_test_button.click(
+        test_video_processing,
+        inputs=[video_test_input],
+        outputs=[video_test_output]
+    )
+    # type="filepath" makes Gradio pass the callback a path string. With the
+    # default type="numpy" the callback would get a (sample_rate, ndarray) tuple,
+    # which process_audio cannot load because it hands its argument to
+    # AudioSegment.from_file.
+    audio_test_input = gr.Audio(label="Upload Audio or Paste URL", type="filepath")
+    audio_test_button = gr.Button("Test Audio Processing")
+    audio_test_output = gr.Textbox(label="Audio Processing Result", interactive=False)
+    # Run the audio test helper on click and show its returned text.
+    audio_test_button.click(
+        test_audio_processing,
+        inputs=[audio_test_input],
+        outputs=[audio_test_output]
+    )
 # Ensure the app launches when the script is run
 if __name__ == "__main__":