Samuel Thomas committed · 82de5c7
Parent(s): da40168

changes to model
app.py
CHANGED
@@ -100,7 +100,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
             task_id = hf_questions[r]['task_id']
             question_text = hf_questions[r]['question']
             submitted_answer = intelligent_agent(s)
-            answers_payload.append({"task_id": task_id, "
+            answers_payload.append({"task_id": task_id, "model_answer": submitted_answer})
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
         except:
             print(f"Error running agent on task {task_id}: {e}")
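Reviewer note: after this change each row sent to the scoring endpoint uses the `model_answer` key. A minimal sketch of the resulting payload shape (task IDs and answer text are made-up placeholders):

```python
# Hypothetical payload shape built by the loop above.
answers_payload = [
    {"task_id": "task-001", "model_answer": "FINAL ANSWER: 3"},
    {"task_id": "task-002", "model_answer": "FINAL ANSWER: Paris"},
]
```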
tools.py
CHANGED
@@ -22,6 +22,7 @@ from langchain.schema import Document
 from transformers import BlipProcessor, BlipForQuestionAnswering, pipeline
 from io import BytesIO
 from sentence_transformers import SentenceTransformer
+from transformers import RagRetriever, RagTokenizer, RagSequenceForGeneration


 import os
@@ -84,8 +85,8 @@ def write_bytes_to_temp_dir(file_bytes: bytes, file_name: str) -> str:
 class State(TypedDict, total=False):
     question: str
     task_id: str
-    input_file: bytes
-    file_type: str
+    input_file: Optional[bytes]
+    file_type: Optional[str]
     context: List[Document]  # Using LangChain's Document class
     file_path: Optional[str]
     youtube_url: Optional[str]
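Reviewer note: `State` is declared with `total=False`, so every key is optional and nodes can fill fields in incrementally. A minimal sketch of constructing a state for a text-only question (values are placeholders):

```python
# Hypothetical text-only state; file fields stay unset until a task attaches one.
state: State = {
    "question": "What is the capital of France?",
    "task_id": "task-001",
    "context": [],
}
```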
@@ -94,31 +95,33 @@ class State(TypedDict, total=False):
     next: Optional[str]  # Added to track the next node

 # --- LLM pipeline for general questions ---
-llm_pipe = pipeline(
-
-
-
-
-
-
-
-
-
+llm_pipe = pipeline(
+    "text-generation",
+    model="microsoft/Phi-3-mini-4k-instruct",
+    device_map=0,
+    torch_dtype="auto",
+    max_new_tokens=256
+)
+
+# Initialize RAG components
+tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base")
+retriever = RagRetriever.from_pretrained(
+    "facebook/rag-token-base",
+    index_name="exact",  # or "legacy" for legacy FAISS index
+    use_dummy_dataset=False,  # set to False and download the full index for real Wikipedia retrieval
+    trust_remote_code=True
+)
+rag_model = RagSequenceForGeneration.from_pretrained("facebook/rag-token-base", retriever=retriever)

 # Speech-to-text pipeline
 asr_pipe = pipeline(
     "automatic-speech-recognition",
     model="openai/whisper-small",
-    device
-    #device_map={"", 0},
-    #max_memory = {0: "4.5GiB"},
-    #device_map="auto"
+    device=0
 )

-# ---
-
-device = "cpu"
+# --- BLIP VQA setup ---
+device = "cuda" if torch.cuda.is_available() else "cpu"
 vqa_model_name = "Salesforce/blip-vqa-base"
 processor_vqa = BlipProcessor.from_pretrained(vqa_model_name)
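Reviewer note: with `pipeline("text-generation", ...)`, `generated_text` contains the prompt followed by the continuation, which is why the nodes below post-process the output with `ensure_final_answer_format`. A minimal call sketch, assuming the model loaded successfully:

```python
# Minimal usage sketch of llm_pipe as configured above.
out = llm_pipe("Question: 2 + 2 = ?\nAnswer:")
print(out[0]["generated_text"])  # prompt + up to 256 new tokens
```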
@@ -130,18 +133,47 @@ except torch.cuda.OutOfMemoryError:
     device = "cpu"  # Switch device to CPU
     model_vqa = BlipForQuestionAnswering.from_pretrained(vqa_model_name).to(device)

+# --- Helper functions ---
+def ensure_final_answer_format(answer_text: str) -> str:
+    """Ensure the answer ends with FINAL ANSWER: format"""
+    # Check if the answer already contains a FINAL ANSWER section
+    if "FINAL ANSWER:" in answer_text:
+        # Extract everything after FINAL ANSWER:
+        final_answer_part = answer_text.split("FINAL ANSWER:", 1)[1].strip()
+        return f"FINAL ANSWER: {final_answer_part}"
+    else:
+        # If no FINAL ANSWER section exists, wrap the entire answer
+        return f"FINAL ANSWER: {answer_text.strip()}"
+
+def extract_entities(text: str) -> List[str]:
+    """Extract key entities from text using spaCy if available, or regex fallback"""
+    if nlp:
+        # Using spaCy for better entity extraction
+        doc = nlp(text)
+        entities = [ent.text for ent in doc.ents]
+        keywords = [token.text for token in doc if token.pos_ in ("PROPN", "NOUN")]
+        return entities if entities else keywords
+    else:
+        # Simple fallback using regex to extract potential keywords
+        words = text.lower().split()
+        stopwords = ["what", "who", "when", "where", "why", "how", "is", "are", "the", "a", "an", "of", "in", "on", "at"]
+        keywords = [word for word in words if word not in stopwords and len(word) > 2]
+        return keywords

-# --- Helper: Answer question on a single frame ---
 def answer_question_on_frame(image_path, question):
-
-
-
-
-
-
-
-
+    """Answer a question about a single video frame using BLIP"""
+    try:
+        image = Image.open(image_path).convert('RGB')
+        inputs = processor_vqa(image, question, return_tensors="pt").to(device)
+        out = model_vqa.generate(**inputs)
+        answer = processor_vqa.decode(out[0], skip_special_tokens=True)
+        return answer
+    except Exception as e:
+        print(f"Error processing frame {image_path}: {str(e)}")
+        return "Error processing this frame"
+
 def answer_video_question(frames_dir, question):
+    """Answer a question about a video by analyzing extracted frames"""
     valid_exts = ('.jpg', '.jpeg', '.png')

     # Check if directory exists
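Reviewer note: `ensure_final_answer_format` is the normalization used by every node below, so all paths end in a single `FINAL ANSWER:` line. A behavior sketch:

```python
# Both calls yield "FINAL ANSWER: Paris".
ensure_final_answer_format("Some reasoning... FINAL ANSWER: Paris")
ensure_final_answer_format("Paris")
```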
@@ -193,8 +225,8 @@ def answer_video_question(frames_dir, question):
         "answer_counts": counted
     }

-
-
+def download_youtube_video(url, output_dir='/tmp/video/', output_filename='downloaded_video.mp4'):
+    """Download a YouTube video using yt-dlp"""
     # Ensure the output directory exists
     os.makedirs(output_dir, exist_ok=True)

@@ -209,25 +241,27 @@ def download_youtube_video(url, output_dir='tmp/content/video/', output_filename
     # Set output path for yt-dlp
     output_path = os.path.join(output_dir, output_filename)

-
-
-
-
-
-
-    '
-
-
-
-
-
-
-
+    try:
+        ydl_opts = {
+            'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
+            'outtmpl': output_path,
+            'quiet': True,
+            'merge_output_format': 'mp4',  # Ensures merged output is mp4
+            'postprocessors': [{
+                'key': 'FFmpegVideoConvertor',
+                'preferedformat': 'mp4',  # Recode if needed
+            }]
+        }
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            ydl.download([url])
+        return output_path
+    except Exception as e:
+        print(f"Error downloading YouTube video: {str(e)}")
+        return None

-# --- Helper: Extract frames from video ---
 def extract_frames(video_path, output_dir, frame_interval_seconds=10):
-
+    """Extract frames from a video file at specified intervals"""
+    # Clean output directory before extracting new frames
     if os.path.exists(output_dir):
         for filename in os.listdir(output_dir):
             file_path = os.path.join(output_dir, filename)
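Reviewer note: a hedged smoke test for the downloader and frame extractor above; the URL is a placeholder, and the post-processor step assumes `ffmpeg` is available in the Space:

```python
# Hypothetical usage; download_youtube_video returns None on failure instead of raising.
path = download_youtube_video("https://www.youtube.com/watch?v=VIDEO_ID")
if path:
    ok = extract_frames(path, "/tmp/frames", frame_interval_seconds=10)
    print(f"Saved {path}, frames extracted: {ok}")
```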
@@ -266,33 +300,23 @@ def extract_frames(video_path, output_dir, frame_interval_seconds=10):
         print(f"Exception during frame extraction: {e}")
         return False

-def image_qa(image_path: str, question: str
-    """
-
-
-
-
-
-
-
-
-
-    """
-    # Create VQA pipeline with specified model
-    vqa_pipeline = pipeline("visual-question-answering", model=model_name)
-
-    # Get predictions (automatically handles local files/URLs)
-    results = vqa_pipeline(image=image_path, question=question, top_k=1)
-
-    # Return top answer
-    return results[0]['answer']
-
+def image_qa(image_path: str, question: str) -> str:
+    """Answer questions about an image using the BLIP model"""
+    try:
+        image = Image.open(image_path).convert('RGB')
+        inputs = processor_vqa(image, question, return_tensors="pt").to(device)
+        out = model_vqa.generate(**inputs)
+        answer = processor_vqa.decode(out[0], skip_special_tokens=True)
+        return answer
+    except Exception as e:
+        print(f"Error in image_qa: {str(e)}")
+        return f"Error processing image: {str(e)}"

+# --- Node functions ---
 def router(state: Dict[str, Any]) -> str:
-    """Determine the next node based on
+    """Determine the next node based on question content and file type"""
     question = state.get('question', '')

-
     # Pattern for Wikipedia and similar sources
     wiki_pattern = r"(wikipedia\.org|wiki|encyclopedia|britannica\.com|encyclop[a|æ]dia)"
     has_wiki = re.search(wiki_pattern, question, re.IGNORECASE) is not None
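Reviewer note: the rewrite drops the old per-call `pipeline("visual-question-answering", model=model_name)` construction (which also referenced an undefined `model_name`) in favor of the module-level BLIP processor/model, so weights load once at startup. A hedged call sketch with a placeholder path:

```python
# Hypothetical call; assumes the file exists and BLIP loaded above.
print(image_qa("/tmp/example.png", "How many birds are in the picture?"))
```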
@@ -327,30 +351,52 @@ def router(state: Dict[str, Any]) -> str:
     else:
         return "llm"

-
-# --- Node Implementation ---
-def node_image(state: Dict[str, Any]) -> Dict[str, Any]:
-    """Router node that decides which node to go to next."""
-    print("Running node_image")
-    # Add the next state to the state dict
-    img = Image.open(state['file_path'])
-    state['answer'] = image_qa(state['file_path'], state['question'])
-    return state
-
-
 def node_decide(state: Dict[str, Any]) -> Dict[str, Any]:
-    """Router node that decides which node to go to next
+    """Router node that decides which node to go to next"""
     print("Running node_decide")
+    # Initialize context list if not present
+    if 'context' not in state:
+        state['context'] = []
     # Add the next state to the state dict
     state["next"] = router(state)
     print(f"Routing to: {state['next']}")
     return state

+def node_image(state: Dict[str, Any]) -> Dict[str, Any]:
+    """Process image-based questions"""
+    print("Running node_image")
+    try:
+        # Make sure the image file exists
+        if not os.path.exists(state['file_path']):
+            state['answer'] = ensure_final_answer_format("Image file not found.")
+            return state
+
+        # Get answer from image QA model
+        answer = image_qa(state['file_path'], state['question'])
+
+        # Format the final answer
+        state['answer'] = ensure_final_answer_format(answer)
+
+        # Add document to state for traceability
+        image_doc = Document(
+            page_content=f"Image analysis result: {answer}",
+            metadata={"source": "image_analysis", "file_path": state['file_path']}
+        )
+        state['context'].append(image_doc)
+
+    except Exception as e:
+        error_msg = f"Error processing image: {str(e)}"
+        print(error_msg)
+        state['answer'] = ensure_final_answer_format(error_msg)
+
+    return state
+
 def node_video(state: Dict[str, Any]) -> Dict[str, Any]:
+    """Process video-based questions"""
     print("Running node_video")
     youtube_url = state.get('youtube_url')
     if not youtube_url:
-        state['answer'] = "No YouTube URL found in the question."
+        state['answer'] = ensure_final_answer_format("No YouTube URL found in the question.")
         return state

     question = state['question']
@@ -361,7 +407,7 @@ def node_video(state: Dict[str, Any]) -> Dict[str, Any]:

     video_file = download_youtube_video(youtube_url)
     if not video_file or not os.path.exists(video_file):
-        state['answer'] = "Failed to download the video."
+        state['answer'] = ensure_final_answer_format("Failed to download the video.")
         return state

     frames_dir = "/tmp/frames"
@@ -369,11 +415,11 @@ def node_video(state: Dict[str, Any]) -> Dict[str, Any]:

     success = extract_frames(video_path=video_file, output_dir=frames_dir, frame_interval_seconds=10)
     if not success:
-        state['answer'] = "Failed to extract frames from the video."
+        state['answer'] = ensure_final_answer_format("Failed to extract frames from the video.")
         return state

     result = answer_video_question(frames_dir, question_text)
-
+    final_answer = result['most_common_answer']
     state['frame_answers'] = result['all_answers']

     # Create Document objects for each frame analysis
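Reviewer note: `answer_video_question` returns per-frame answers plus their counts, so the video answer is effectively a majority vote over frames. A sketch of that aggregation; `collections.Counter` is an assumption about how `counted` is built:

```python
from collections import Counter

all_answers = ["2", "2", "3"]          # one BLIP answer per extracted frame
counted = Counter(all_answers)         # -> "answer_counts"
most_common_answer = counted.most_common(1)[0][0]  # -> "2"
```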
@@ -385,15 +431,15 @@ def node_video(state: Dict[str, Any]) -> Dict[str, Any]:
         )
         frame_documents.append(doc)

-    # Add documents to state
-    if 'context' not in state:
-        state['context'] = []
+    # Add documents to state
     state['context'].extend(frame_documents)
+    state['answer'] = ensure_final_answer_format(final_answer)

     print(f"Video answer: {state['answer']}")
     return state

 def node_audio_rag(state: Dict[str, Any]) -> Dict[str, Any]:
+    """Process audio-based questions"""
     print(f"Processing audio file: {state['file_path']}")

     try:
@@ -403,52 +449,65 @@ def node_audio_rag(state: Dict[str, Any]) -> Dict[str, Any]:
         audio_transcript = asr_result['text']
         print(f"Audio transcript: {audio_transcript}")

-        # Step 2: Store
+        # Step 2: Store transcript in vector store
         transcript_doc = [Document(page_content=audio_transcript)]
         embeddings = HuggingFaceEmbeddings(model_name='BAAI/bge-large-en-v1.5')
         vector_db = FAISS.from_documents(transcript_doc, embedding=embeddings)

         # Step 3: Retrieve relevant docs for the user's question
         question = state['question']
-        similar_docs = vector_db.similarity_search(question, k=1)
+        similar_docs = vector_db.similarity_search(question, k=1)
         retrieved_context = "\n".join([doc.page_content for doc in similar_docs])

-        # Step 4:
+        # Step 4: Generate answer
         prompt = (
-            f"
-            f"
-            f"Question: {question}\
+            f"You are an AI assistant that answers questions about audio content.\n\n"
+            f"Audio transcript: {retrieved_context}\n\n"
+            f"Question: {question}\n\n"
+            f"Based only on the provided audio transcript, answer the question. "
+            f"If the transcript does not contain relevant information, state that clearly.\n\n"
+            f"End your response with 'FINAL ANSWER: ' followed by a concise answer."
         )
+
         llm_response = llm_pipe(prompt)
-
+        answer_text = llm_response[0]['generated_text']
+
+        # Add documents to state
+        state['context'].extend(transcript_doc)
+        state['context'].append(Document(
+            page_content=prompt,
+            metadata={"source": "audio_analysis_prompt"}
+        ))
+
+        # Ensure final answer format
+        state['answer'] = ensure_final_answer_format(answer_text)

     except Exception as e:
         error_msg = f"Audio processing error: {str(e)}"
         print(error_msg)
-        state['answer'] = error_msg
+        state['answer'] = ensure_final_answer_format(error_msg)

     return state

 def node_llm(state: Dict[str, Any]) -> Dict[str, Any]:
+    """Process general knowledge questions with LLM"""
     print("Running node_llm")
     question = state['question']

-    # Optionally add context from state (e.g., Wikipedia/Wikidata content)
-    context_text = ""
-    if 'article_content' in state and state['article_content']:
-        context_text = f"\n\nBackground Information:\n{state['article_content']}\n"
-    elif 'context' in state and state['context']:
-        context_text = "\n\n".join([doc.page_content for doc in state['context']])
-
     # Compose a detailed prompt
     prompt = (
-        "You are an
-        "
-        "If the
-        "
-
-
-        "
+        "You are an AI assistant that answers questions using your general knowledge. "
+        "Follow these steps:\n\n"
+        "1. If the question appears to be scrambled or jumbled, first try to unscramble or reconstruct the intended meaning.\n"
+        "2. Analyze the question (unscrambled if needed) and use your own knowledge to answer it.\n"
+        "3. If the question can't be answered with certainty, provide your best estimate and clearly explain any assumptions.\n"
+        "4. Format your answer using these rules:\n"
+        "   - Numbers: Plain digits without commas/units (e.g. 1234567)\n"
+        "   - Strings: Minimal words, no articles/abbreviations\n"
+        "   - Lists: comma-separated values without extra formatting\n\n"
+        "5. Always conclude with:\n"
+        "FINAL ANSWER: [your answer] (replace bracketed text)\n\n"
+        f"Current question: {question}"
     )

     # Add document to state for traceability
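Reviewer note: the audio node embeds the whole transcript as a single document, so the `k=1` similarity search simply returns that transcript; the FAISS step only pays off once transcripts are chunked. A standalone sketch of the pattern (the `langchain_community` import paths are an assumption, since this hunk does not show the file's imports):

```python
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document

docs = [Document(page_content="The cat sat on the mat.")]
db = FAISS.from_documents(docs, embedding=HuggingFaceEmbeddings(model_name='BAAI/bge-large-en-v1.5'))
print(db.similarity_search("Where did the cat sit?", k=1)[0].page_content)
```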
@@ -456,102 +515,138 @@ def node_llm(state: Dict[str, Any]) -> Dict[str, Any]:
         page_content=prompt,
         metadata={"source": "llm_prompt"}
     )
-    if 'context' not in state:
-        state['context'] = []
     state['context'].append(query_doc)

     try:
         result = llm_pipe(prompt)
-
+        answer_text = result[0]['generated_text']
+        state['answer'] = ensure_final_answer_format(answer_text)
     except Exception as e:
         print(f"Error in LLM processing: {str(e)}")
-
+        error_msg = f"An error occurred while processing your question: {str(e)}"
+        state['answer'] = ensure_final_answer_format(error_msg)

     print(f"LLM answer: {state['answer']}")
     return state

+def retrieve(state: State) -> State:
+    """Retrieve relevant documents using RAG"""
+    print("Running retrieve")
+    question = state["question"]
+
+    try:
+        # Tokenize the question
+        inputs = tokenizer(question, return_tensors="pt")
+
+        # Get doc_ids by using the retriever directly
+        question_hidden_states = rag_model.question_encoder(inputs["input_ids"])[0]
+        docs_dict = retriever(
+            inputs["input_ids"].numpy(),
+            question_hidden_states.detach().numpy(),
+            return_tensors="pt"
+        )
+
+        # Extract the retrieved passages
+        all_chunks = []
+
+        # Debug print to see what's in docs_dict
+        print(f"docs_dict keys: {docs_dict.keys()}")
+
+        # Check for different possible keys that might contain the documents
+        doc_text_key = None
+        for possible_key in ['retrieved_doc_text', 'doc_text', 'texts', 'documents']:
+            if possible_key in docs_dict:
+                doc_text_key = possible_key
+                break
+
+        if doc_text_key:
+            # Access the retrieved document texts from the docs_dict
+            for i in range(len(docs_dict["doc_ids"][0])):
+                doc_text = docs_dict[doc_text_key][0][i]
+                all_chunks.append(Document(page_content=doc_text))
+
+            print(f"Retrieved {len(all_chunks)} documents")
+        else:
+            # Fallback: Try to extract document text from doc_ids
+            doc_ids = docs_dict.get("doc_ids", [[]])[0]
+            print(f"Retrieved doc_ids: {doc_ids}")
+
+            # Create minimal document stubs from IDs
+            for doc_id in doc_ids:
+                stub_text = f"Information related to document ID: {doc_id}"
+                all_chunks.append(Document(page_content=stub_text))
+
+            print(f"Created {len(all_chunks)} document stubs from IDs")
+
+        # Add documents to state context
+        if not state.get('context'):
+            state['context'] = []
+        state['context'].extend(all_chunks)
+
+    except Exception as e:
+        print(f"Error in retrieval: {str(e)}")
+        # Create an error document
+        error_doc = Document(
+            page_content=f"Error during retrieval: {str(e)}",
+            metadata={"source": "retrieval_error"}
+        )
+        if not state.get('context'):
+            state['context'] = []
+        state['context'].append(error_doc)
+
+    return state
+
+def generate(state: State) -> State:
+    """Generate an answer based on retrieved documents"""
+    print("Running generate")
+
+    try:
+        # Check if context exists
+        if not state.get('context') or len(state['context']) == 0:
+            state['answer'] = ensure_final_answer_format("No relevant information found to answer your question.")
+            return state
+
+        # Concatenate all context documents into a single string
+        docs_content = "\n\n".join(doc.page_content for doc in state["context"])
+
+        # Format the prompt for the LLM
+        prompt_str = PromptTemplate(
+            input_variables=["question", "context"],
+            template=(
+                "You are an AI assistant that answers questions using retrieved context. "
+                "Follow these steps:\n\n"
+                "1. Analyze the provided context:\n{context}\n\n"
+                "2. If the context contains scrambled text, first attempt to reconstruct meaningful information\n"
+                "3. If the question can't be answered from context alone, combine context with general knowledge "
+                "but clearly state this limitation\n"
+                "4. Format your answer using these rules:\n"
+                "   - Numbers: Plain digits without commas/units (e.g. 1234567)\n"
+                "   - Strings: Minimal words, no articles/abbreviations\n"
+                "   - Lists: comma-separated values without extra formatting\n\n"
+                "5. Always conclude with:\n"
+                "FINAL ANSWER: [your answer] (replace bracketed text)\n\n"
+                "Current question: {question}"
+            )
+        ).format(question=state["question"], context=docs_content)
+
+        # Generate answer using the LLM pipeline
+        response = llm_pipe(prompt_str)
+        answer_text = response[0]["generated_text"]
+
+        # Ensure answer has the FINAL ANSWER format
+        state['answer'] = ensure_final_answer_format(answer_text)
+
+    except Exception as e:
+        print(f"Error in generate node: {str(e)}")
+        error_msg = f"Error generating answer: {str(e)}"
+        state['answer'] = ensure_final_answer_format(error_msg)
+
+    return state

 # --- Define the edge condition function ---
 def get_next_node(state: Dict[str, Any]) -> str:
-    """Get the next node from the state
+    """Get the next node from the state"""
     return state["next"]

-
-# 2. Improved Wikipedia Retrieval Node
-def extract_keywords(question: str) -> List[str]:
-    doc = nlp(question)
-    keywords = [token.text for token in doc if token.pos_ in ("PROPN", "NOUN")]  # Extract proper nouns and nouns
-    return keywords
-
-def extract_entities(question: str) -> List[str]:
-    doc = nlp(question)
-    entities = [ent.text for ent in doc.ents]
-    return entities if entities else [token.text for token in doc if token.pos_ in ("PROPN", "NOUN")]
-
-
-def retrieve(state: State) -> dict:
-    keywords = extract_entities(state["question"])
-    query = " ".join(keywords)
-    search_results = wikipedia.search(query)
-    selected_page = search_results[0] if search_results else None
-
-    if selected_page:
-        loader = WikipediaLoader(
-            query=selected_page,
-            lang="en",
-            load_max_docs=1,
-            doc_content_chars_max=100000,
-            load_all_available_meta=True
-        )
-        docs = loader.load()
-        # Chunk the article for finer retrieval
-        from langchain.text_splitter import RecursiveCharacterTextSplitter
-        splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
-        all_chunks = []
-        for doc in docs:
-            chunks = splitter.split_text(doc.page_content)
-            all_chunks.extend([Document(page_content=chunk) for chunk in chunks])
-        # Optionally: re-rank or filter chunks here
-        return {"context": all_chunks}
-    else:
-        return {"context": []}
-
-# 3. Prompt Template for General QA
-prompt = PromptTemplate(
-    input_variables=["question", "context"],
-    template=(
-        "You are an expert researcher. Given the following context from Wikipedia, answer the user's question as accurately as possible. "
-        "If the text appears to be scrambled, try to unscramble the text for the user"
-        "If the information is incomplete or ambiguous, provide your best estimate based on the available evidence, and clearly explain any assumptions or reasoning you use. "
-        "If the answer requires multiple steps or deeper analysis, break down the question into sub-questions and answer them step by step, citing the relevant context for each step."
-        "Context:\n{context}\n\n"
-        "Question: {question}\n\n"
-        "Best Estimate Answer:"
-    )
-)
-
-"""
-def generate(state: State) -> dict:
-    # Concatenate all context documents into a single string
-    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
-    # Format the prompt for the LLM
-    prompt_str = prompt.format(question=state["question"], context=docs_content)
-    # Generate answer
-    response = llm.invoke(prompt_str)
-    return {"answer": response}
-"""
-
-def generate(state: dict) -> dict:
-    # Concatenate all context documents into a single string
-    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
-    # Format the prompt for the LLM
-    prompt_str = prompt.format(question=state["question"], context=docs_content)
-    # Generate answer using Hugging Face pipeline
-    response = llm_pipe(prompt_str)
-    # Extract generated text
-    answer = response[0]["generated_text"]
-    return {"answer": answer}
-
 # Create the StateGraph
 graph = StateGraph(State)

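Reviewer note: this hunk swaps the spaCy-keyword + `wikipedia.search` + `WikipediaLoader` retriever for Facebook's pretrained RAG retriever and folds the QA prompt template into `generate`. Calling the HF retriever directly returns a dict whose key names vary across versions, hence the key-probing loop. A hedged sketch of how the pair runs once wired into the graph below:

```python
# Hypothetical manual run of the retrieve -> generate chain.
state = {"question": "Who wrote The Old Man and the Sea?", "context": []}
state = retrieve(state)   # fills state["context"] with RAG passages (or ID stubs)
state = generate(state)   # prompts llm_pipe and sets state["answer"]
print(state["answer"])    # "FINAL ANSWER: ..."
```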
@@ -568,7 +663,7 @@ graph.add_node("audio", node_audio_rag)
 graph.add_edge(START, "decide")
 graph.add_edge("retrieve", "generate")

-# Add conditional edges from decide to
+# Add conditional edges from decide to other nodes based on question
 graph.add_conditional_edges(
     "decide",
     get_next_node,
@@ -581,7 +676,7 @@ graph.add_conditional_edges(
     }
 )

-# Add edges from
+# Add edges from all terminal nodes to END
 graph.add_edge("video", END)
 graph.add_edge("llm", END)
 graph.add_edge("generate", END)
@@ -591,14 +686,33 @@ graph.add_edge("audio", END)
 # Compile the graph
 agent = graph.compile()

-# ---
+# --- Intelligent Agent Function ---
 def intelligent_agent(state: State) -> str:
     """Process a question using the appropriate pipeline based on content."""
-    #state = State(question= question)
     try:
+        # Ensure state has proper structure
+        if not isinstance(state, dict):
+            return "FINAL ANSWER: Error - input must be a valid State dictionary"
+
+        # Make sure question exists
+        if 'question' not in state:
+            return "FINAL ANSWER: Error - question is required"
+
+        # Initialize context if not present
+        if 'context' not in state:
+            state['context'] = []
+
+        print(f"Processing question: {state['question']}")
+
+        # Invoke the agent with the state
         final_state = agent.invoke(state)
-
+
+        # Ensure answer has FINAL ANSWER format
+        answer = final_state.get('answer', "No answer found.")
+        formatted_answer = ensure_final_answer_format(answer)
+
+        return formatted_answer
+
     except Exception as e:
         print(f"Error in agent execution: {str(e)}")
-        return f"An error occurred
+        return f"FINAL ANSWER: An error occurred - {str(e)}"
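Reviewer note: `intelligent_agent` now takes the full `State` dict (matching the `intelligent_agent(s)` call site in app.py) and always returns a `FINAL ANSWER:`-prefixed string, even on failure. A hedged end-to-end example with a placeholder question:

```python
# Hypothetical invocation of the compiled graph.
print(intelligent_agent({"question": "What is the capital of France?"}))
# e.g. "FINAL ANSWER: Paris"
```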