Spaces:

dataera2013
/

midterm

Sleeping

App Files Community

Nagesh Muralidhar committed on Feb 26, 2025

Commit

f6b7a05

1 Parent(s): 32402f2

Focus

Browse files

Files changed (7) hide show

server/__pycache__/agents.cpython-311.pyc +0 -0
server/__pycache__/main.cpython-311.pyc +0 -0
server/__pycache__/workflow.cpython-311.pyc +0 -0
server/logs/agents.log +0 -0
server/main.py +154 -47
server/transcripts/podcasts.json +0 -1
server/utils.py +15 -4

server/__pycache__/agents.cpython-311.pyc CHANGED Viewed

Binary files a/server/__pycache__/agents.cpython-311.pyc and b/server/__pycache__/agents.cpython-311.pyc differ

server/__pycache__/main.cpython-311.pyc CHANGED Viewed

Binary files a/server/__pycache__/main.cpython-311.pyc and b/server/__pycache__/main.cpython-311.pyc differ

server/__pycache__/workflow.cpython-311.pyc CHANGED Viewed

Binary files a/server/__pycache__/workflow.cpython-311.pyc and b/server/__pycache__/workflow.cpython-311.pyc differ

server/logs/agents.log ADDED Viewed

The diff for this file is too large to render. See raw diff

server/main.py CHANGED Viewed

@@ -338,6 +338,27 @@ async def podcast_chat(podcast_id: str, request: PodcastChatRequest):
     """Handle chat messages for a specific podcast."""
     try:
         logger.info(f"Processing chat message for podcast {podcast_id}")
         # Path to transcripts file
         transcripts_file = os.path.join(os.path.dirname(__file__), "transcripts", "podcasts.json")
@@ -352,50 +373,74 @@ async def podcast_chat(podcast_id: str, request: PodcastChatRequest):
             with open(transcripts_file, 'r') as f:
                 transcripts = json.load(f)
                 logger.info(f"Loaded {len(transcripts)} transcripts")
         except json.JSONDecodeError as e:
             logger.error(f"Error decoding transcripts file: {str(e)}")
             raise HTTPException(status_code=500, detail="Error reading transcripts file")
-        # Convert podcast_id to zero-based index
-        try:
-            podcast_index = int(podcast_id) - 1
-            if podcast_index < 0 or podcast_index >= len(transcripts):
-                logger.error(f"Invalid podcast index: {podcast_index} (total transcripts: {len(transcripts)})")
-                raise ValueError(f"Invalid podcast ID: {podcast_id}")
-        except ValueError as e:
-            logger.error(f"Error converting podcast ID: {str(e)}")
-            raise HTTPException(status_code=404, detail=str(e))
-        # Get podcast transcript
-        try:
-            podcast_transcript = transcripts[podcast_index].get("podcastScript")
-            if not podcast_transcript:
-                logger.error(f"No transcript content found for podcast {podcast_id}")
-                raise HTTPException(status_code=404, detail="No transcript content found for this podcast")
-            logger.info(f"Found transcript for podcast {podcast_id}")
-            logger.debug(f"Transcript content: {podcast_transcript[:200]}...")  # Log first 200 chars
-        except (IndexError, KeyError) as e:
-            logger.error(f"Error accessing podcast transcript: {str(e)}")
-            raise HTTPException(status_code=404, detail="Transcript not found for this podcast")
         # Split text into chunks
         text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=500,
-            chunk_overlap=50,
             length_function=len,
         )
         # Use split_text for strings instead of split_documents
-        chunks = text_splitter.split_text(podcast_transcript)
-        logger.info(f"Split transcript into {len(chunks)} chunks")
         if not chunks:
             logger.error("No content chunks found in transcript")
             raise HTTPException(status_code=404, detail="No content chunks found in transcript")
         # Initialize embedding model
-        embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
         # Create a unique collection name for this podcast
         collection_name = f"podcast_{podcast_id}"
@@ -411,40 +456,94 @@ async def podcast_chat(podcast_id: str, request: PodcastChatRequest):
         # Configure the retriever with search parameters
         qdrant_retriever = vectorstore.as_retriever(
-            search_type="similarity",
-            search_kwargs={"k": 3}  # Get top 3 most relevant chunks
         )
         base_rag_prompt_template = """\
         You are a helpful podcast assistant. Answer the user's question based on the provided context from the podcast transcript.
-        If you can't find the answer in the context, just say "I don't have enough information to answer that question."
         Keep your responses concise and focused on the question.
         Context:
         {context}
         Question:
         {question}
         """
         base_rag_prompt = ChatPromptTemplate.from_template(base_rag_prompt_template)
-        base_llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.7)
         # Create the RAG chain
         def format_docs(docs):
-            return "\n\n".join(doc.page_content for doc in docs)
         # Add logging for the retrieved documents and final prompt
         def get_context_and_log(input_dict):
-            retrieved_docs = qdrant_retriever.get_relevant_documents(input_dict["question"])
-            context = format_docs(retrieved_docs)
-            logger.info("Retrieved context from podcast:")
-            logger.info("-" * 50)
-            logger.info(f"Context:\n{context}")
-            logger.info("-" * 50)
-            logger.info(f"Question: {input_dict['question']}")
-            logger.info("-" * 50)
-            return {"context": context, "question": input_dict["question"]}
         # Create the chain
         chain = (
@@ -454,11 +553,19 @@ async def podcast_chat(podcast_id: str, request: PodcastChatRequest):
             | base_llm
         )
-        # Get response
-        response = chain.invoke({"question": request.message})
-        logger.info(f"Generated response: {response.content}")
-        return PodcastChatResponse(response=response.content)
     except HTTPException:
         raise

     """Handle chat messages for a specific podcast."""
     try:
         logger.info(f"Processing chat message for podcast {podcast_id}")
+        logger.info(f"User message: {request.message}")
+        # Get list of audio files
+        audio_files = [f for f in os.listdir(audio_dir) if f.endswith('.mp3')]
+        logger.info(f"Found {len(audio_files)} audio files: {audio_files}")
+        # Convert podcast_id to zero-based index and get the filename
+        try:
+            podcast_index = int(podcast_id) - 1
+            if podcast_index < 0 or podcast_index >= len(audio_files):
+                logger.error(f"Invalid podcast index: {podcast_index} (total files: {len(audio_files)})")
+                raise ValueError(f"Invalid podcast ID: {podcast_id}")
+            podcast_filename = audio_files[podcast_index]
+            logger.info(f"Found podcast file: {podcast_filename}")
+        except ValueError as e:
+            logger.error(f"Error converting podcast ID: {str(e)}")
+            raise HTTPException(status_code=404, detail=str(e))
+        # Extract topic from filename
+        topic = podcast_filename.split('-')[0].replace('_', ' ')
+        logger.info(f"Extracted topic: {topic}")
         # Path to transcripts file
         transcripts_file = os.path.join(os.path.dirname(__file__), "transcripts", "podcasts.json")
             with open(transcripts_file, 'r') as f:
                 transcripts = json.load(f)
                 logger.info(f"Loaded {len(transcripts)} transcripts")
+                logger.info(f"Available topics: {[t.get('topic', 'NO_TOPIC') for t in transcripts]}")
         except json.JSONDecodeError as e:
             logger.error(f"Error decoding transcripts file: {str(e)}")
             raise HTTPException(status_code=500, detail="Error reading transcripts file")
+        # Find matching transcript by topic
+        podcast_transcript = None
+        for transcript in transcripts:
+            transcript_topic = transcript.get("topic", "").lower().strip()
+            if transcript_topic == topic.lower().strip():
+                podcast_transcript = transcript.get("podcastScript")
+                logger.info(f"Found matching transcript for topic: {topic}")
+                break
+        if not podcast_transcript:
+            logger.error(f"No transcript found for topic: {topic}")
+            logger.error(f"Available topics: {[t.get('topic', 'NO_TOPIC') for t in transcripts]}")
+            raise HTTPException(status_code=404, detail=f"No transcript found for topic: {topic}")
+        logger.info(f"Found transcript for topic: {topic}")
+        logger.info(f"Full transcript length: {len(podcast_transcript)} characters")
+        logger.debug(f"Transcript preview: {podcast_transcript[:200]}...")
         # Split text into chunks
         text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000,
+            chunk_overlap=100,
             length_function=len,
+            separators=["\n\n", "\n", ". ", " ", ""]
         )
         # Use split_text for strings instead of split_documents
+        try:
+            logger.info("Starting text splitting process...")
+            chunks = text_splitter.split_text(podcast_transcript)
+            logger.info(f"Successfully split transcript into {len(chunks)} chunks")
+            # Log some sample chunks
+            logger.info("\nSample chunks:")
+            for i, chunk in enumerate(chunks[:3]):  # Log first 3 chunks
+                logger.info(f"\nChunk {i+1}:")
+                logger.info("=" * 50)
+                logger.info(chunk)
+                logger.info("=" * 50)
+            if len(chunks) > 3:
+                logger.info(f"... and {len(chunks) - 3} more chunks")
+        except Exception as e:
+            logger.error(f"Error splitting text into chunks: {str(e)}")
+            raise HTTPException(status_code=500, detail=f"Error splitting text: {str(e)}")
         if not chunks:
             logger.error("No content chunks found in transcript")
             raise HTTPException(status_code=404, detail="No content chunks found in transcript")
+        # Validate chunk sizes
+        chunk_sizes = [len(chunk) for chunk in chunks]
+        logger.info(f"\nChunk size statistics:")
+        logger.info(f"Min chunk size: {min(chunk_sizes)} characters")
+        logger.info(f"Max chunk size: {max(chunk_sizes)} characters")
+        logger.info(f"Average chunk size: {sum(chunk_sizes)/len(chunk_sizes):.2f} characters")
         # Initialize embedding model
+        embedding_model = OpenAIEmbeddings(
+            model="text-embedding-3-small",
+            openai_api_key=openai_api_key
+        )
         # Create a unique collection name for this podcast
         collection_name = f"podcast_{podcast_id}"
         # Configure the retriever with search parameters
         qdrant_retriever = vectorstore.as_retriever(
+            search_type="similarity",  # Use simple similarity search
+            search_kwargs={
+                "k": 8,  # Increased from 5 to 8 chunks
+                "score_threshold": 0.05  # Lowered threshold further for more matches
+            }
         )
         base_rag_prompt_template = """\
         You are a helpful podcast assistant. Answer the user's question based on the provided context from the podcast transcript.
+        If the context contains relevant information, use it to answer the question.
+        If you can't find relevant information in the context to answer the question, say "I don't have enough information to answer that question."
         Keep your responses concise and focused on the question.
+        Important: Even if only part of the context is relevant to the question, use that part to provide a partial answer rather than saying there isn't enough information.
         Context:
         {context}
         Question:
         {question}
+        Answer the question using the information from the context above. If you find ANY relevant information, use it to provide at least a partial answer. Only say "I don't have enough information" if there is absolutely nothing relevant in the context.
         """
         base_rag_prompt = ChatPromptTemplate.from_template(base_rag_prompt_template)
+        base_llm = ChatOpenAI(
+            model="gpt-3.5-turbo",
+            temperature=0.7,
+            openai_api_key=openai_api_key
+        )
         # Create the RAG chain
         def format_docs(docs):
+            formatted = "\n\n".join(doc.page_content for doc in docs)
+            logger.info(f"Formatted {len(docs)} documents into context of length: {len(formatted)}")
+            return formatted
         # Add logging for the retrieved documents and final prompt
         def get_context_and_log(input_dict):
+            try:
+                logger.info("\nAttempting to retrieve relevant documents...")
+                # Log the query being used
+                logger.info(f"Query: {input_dict['question']}")
+                # Use the newer invoke method instead of get_relevant_documents
+                retrieved_docs = qdrant_retriever.invoke(input_dict["question"])
+                logger.info(f"Successfully retrieved {len(retrieved_docs)} documents")
+                if not retrieved_docs:
+                    logger.warning("No documents were retrieved!")
+                    return {"context": "No relevant context found.", "question": input_dict["question"]}
+                # Log each retrieved document with its content and similarity score
+                total_content_length = 0
+                for i, doc in enumerate(retrieved_docs):
+                    logger.info(f"\nDocument {i+1}:")
+                    logger.info("=" * 50)
+                    logger.info(f"Content: {doc.page_content}")
+                    logger.info(f"Content Length: {len(doc.page_content)} characters")
+                    logger.info(f"Metadata: {doc.metadata}")
+                    logger.info("=" * 50)
+                    total_content_length += len(doc.page_content)
+                context = format_docs(retrieved_docs)
+                # Log the final formatted context and question
+                logger.info("\nRetrieval Statistics:")
+                logger.info(f"Total documents retrieved: {len(retrieved_docs)}")
+                logger.info(f"Total content length: {total_content_length} characters")
+                logger.info(f"Average document length: {total_content_length/len(retrieved_docs):.2f} characters")
+                logger.info("\nFinal Context and Question:")
+                logger.info("=" * 50)
+                logger.info("Context:")
+                logger.info(f"{context}")
+                logger.info("-" * 50)
+                logger.info(f"Question: {input_dict['question']}")
+                logger.info("=" * 50)
+                if not context.strip():
+                    logger.error("Warning: Empty context retrieved!")
+                    return {"context": "No relevant context found.", "question": input_dict["question"]}
+                return {"context": context, "question": input_dict["question"]}
+            except Exception as e:
+                logger.error(f"Error in get_context_and_log: {str(e)}")
+                logger.error("Stack trace:", exc_info=True)
+                return {"context": "Error retrieving context.", "question": input_dict["question"]}
         # Create the chain
         chain = (
             | base_llm
         )
+        # Get response with enhanced logging
+        try:
+            logger.info("\nGenerating response...")
+            response = chain.invoke({"question": request.message})
+            logger.info("=" * 50)
+            logger.info("Final Response:")
+            logger.info(f"{response.content}")
+            logger.info("=" * 50)
+            return PodcastChatResponse(response=response.content)
+        except Exception as e:
+            logger.error(f"Error generating response: {str(e)}")
+            raise HTTPException(status_code=500, detail=f"Error generating response: {str(e)}")
     except HTTPException:
         raise

server/transcripts/podcasts.json DELETED Viewed

	@@ -1 +0,0 @@
1	- []

server/utils.py CHANGED Viewed

@@ -13,11 +13,15 @@ TRANSCRIPTS_FILE = os.path.join(TRANSCRIPTS_DIR, "podcasts.json")
 def save_transcript(podcast_script: str, user_query: str) -> None:
     """Save podcast transcript to JSON file."""
     # Create new transcript entry
     transcript = {
         "id": str(uuid.uuid4()),
         "podcastScript": podcast_script,
-        "topic": user_query
     }
     try:
@@ -34,13 +38,20 @@ def save_transcript(podcast_script: str, user_query: str) -> None:
         else:
             transcripts = []
-        # Append new transcript
-        transcripts.append(transcript)
         # Save updated transcripts
         with open(TRANSCRIPTS_FILE, 'w') as f:
             json.dump(transcripts, f, indent=2)
-            logger.info("Successfully saved transcript")
     except Exception as e:
         logger.error(f"Error saving transcript: {str(e)}")

 def save_transcript(podcast_script: str, user_query: str) -> None:
     """Save podcast transcript to JSON file."""
+    # Process the topic to match filename format
+    topic = user_query.lower().strip().replace(" ", "_")
+    topic = topic.replace("?", "").replace("!", "").replace(".", "")  # Remove punctuation
     # Create new transcript entry
     transcript = {
         "id": str(uuid.uuid4()),
         "podcastScript": podcast_script,
+        "topic": topic.replace("_", " ")  # Store topic with spaces for matching
     }
     try:
         else:
             transcripts = []
+        # Check if transcript for this topic already exists
+        for i, existing in enumerate(transcripts):
+            if existing.get("topic") == transcript["topic"]:
+                # Update existing transcript
+                transcripts[i] = transcript
+                break
+        else:
+            # Append new transcript if no existing one was found
+            transcripts.append(transcript)
         # Save updated transcripts
         with open(TRANSCRIPTS_FILE, 'w') as f:
             json.dump(transcripts, f, indent=2)
+            logger.info(f"Successfully saved transcript for topic: {transcript['topic']}")
     except Exception as e:
         logger.error(f"Error saving transcript: {str(e)}")