Spaces:
Sleeping
Sleeping
Nagesh Muralidhar
committed on
Commit
·
f6b7a05
1
Parent(s):
32402f2
Focus
Browse files
- server/__pycache__/agents.cpython-311.pyc +0 -0
- server/__pycache__/main.cpython-311.pyc +0 -0
- server/__pycache__/workflow.cpython-311.pyc +0 -0
- server/logs/agents.log +0 -0
- server/main.py +154 -47
- server/transcripts/podcasts.json +0 -1
- server/utils.py +15 -4
server/__pycache__/agents.cpython-311.pyc
CHANGED
|
Binary files a/server/__pycache__/agents.cpython-311.pyc and b/server/__pycache__/agents.cpython-311.pyc differ
|
|
|
server/__pycache__/main.cpython-311.pyc
CHANGED
|
Binary files a/server/__pycache__/main.cpython-311.pyc and b/server/__pycache__/main.cpython-311.pyc differ
|
|
|
server/__pycache__/workflow.cpython-311.pyc
CHANGED
|
Binary files a/server/__pycache__/workflow.cpython-311.pyc and b/server/__pycache__/workflow.cpython-311.pyc differ
|
|
|
server/logs/agents.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
server/main.py
CHANGED
|
@@ -338,6 +338,27 @@ async def podcast_chat(podcast_id: str, request: PodcastChatRequest):
|
|
| 338 |
"""Handle chat messages for a specific podcast."""
|
| 339 |
try:
|
| 340 |
logger.info(f"Processing chat message for podcast {podcast_id}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
|
| 342 |
# Path to transcripts file
|
| 343 |
transcripts_file = os.path.join(os.path.dirname(__file__), "transcripts", "podcasts.json")
|
|
@@ -352,50 +373,74 @@ async def podcast_chat(podcast_id: str, request: PodcastChatRequest):
|
|
| 352 |
with open(transcripts_file, 'r') as f:
|
| 353 |
transcripts = json.load(f)
|
| 354 |
logger.info(f"Loaded {len(transcripts)} transcripts")
|
|
|
|
| 355 |
except json.JSONDecodeError as e:
|
| 356 |
logger.error(f"Error decoding transcripts file: {str(e)}")
|
| 357 |
raise HTTPException(status_code=500, detail="Error reading transcripts file")
|
| 358 |
-
|
| 359 |
-
#
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
logger.info(f"Found transcript for podcast {podcast_id}")
|
| 377 |
-
logger.debug(f"Transcript content: {podcast_transcript[:200]}...") # Log first 200 chars
|
| 378 |
-
except (IndexError, KeyError) as e:
|
| 379 |
-
logger.error(f"Error accessing podcast transcript: {str(e)}")
|
| 380 |
-
raise HTTPException(status_code=404, detail="Transcript not found for this podcast")
|
| 381 |
|
| 382 |
# Split text into chunks
|
| 383 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 384 |
-
chunk_size=
|
| 385 |
-
chunk_overlap=
|
| 386 |
length_function=len,
|
|
|
|
| 387 |
)
|
| 388 |
|
| 389 |
# Use split_text for strings instead of split_documents
|
| 390 |
-
|
| 391 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 392 |
|
| 393 |
if not chunks:
|
| 394 |
logger.error("No content chunks found in transcript")
|
| 395 |
raise HTTPException(status_code=404, detail="No content chunks found in transcript")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 396 |
|
| 397 |
# Initialize embedding model
|
| 398 |
-
embedding_model = OpenAIEmbeddings(
|
|
|
|
|
|
|
|
|
|
| 399 |
|
| 400 |
# Create a unique collection name for this podcast
|
| 401 |
collection_name = f"podcast_{podcast_id}"
|
|
@@ -411,40 +456,94 @@ async def podcast_chat(podcast_id: str, request: PodcastChatRequest):
|
|
| 411 |
|
| 412 |
# Configure the retriever with search parameters
|
| 413 |
qdrant_retriever = vectorstore.as_retriever(
|
| 414 |
-
search_type="similarity",
|
| 415 |
-
search_kwargs={
|
|
|
|
|
|
|
|
|
|
| 416 |
)
|
| 417 |
|
| 418 |
base_rag_prompt_template = """\
|
| 419 |
You are a helpful podcast assistant. Answer the user's question based on the provided context from the podcast transcript.
|
| 420 |
-
If
|
|
|
|
| 421 |
Keep your responses concise and focused on the question.
|
|
|
|
|
|
|
| 422 |
|
| 423 |
Context:
|
| 424 |
{context}
|
| 425 |
|
| 426 |
Question:
|
| 427 |
{question}
|
|
|
|
|
|
|
| 428 |
"""
|
| 429 |
|
| 430 |
base_rag_prompt = ChatPromptTemplate.from_template(base_rag_prompt_template)
|
| 431 |
-
base_llm = ChatOpenAI(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
|
| 433 |
# Create the RAG chain
|
| 434 |
def format_docs(docs):
|
| 435 |
-
|
|
|
|
|
|
|
| 436 |
|
| 437 |
# Add logging for the retrieved documents and final prompt
|
| 438 |
def get_context_and_log(input_dict):
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 448 |
|
| 449 |
# Create the chain
|
| 450 |
chain = (
|
|
@@ -454,11 +553,19 @@ async def podcast_chat(podcast_id: str, request: PodcastChatRequest):
|
|
| 454 |
| base_llm
|
| 455 |
)
|
| 456 |
|
| 457 |
-
# Get response
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 462 |
|
| 463 |
except HTTPException:
|
| 464 |
raise
|
|
|
|
| 338 |
"""Handle chat messages for a specific podcast."""
|
| 339 |
try:
|
| 340 |
logger.info(f"Processing chat message for podcast {podcast_id}")
|
| 341 |
+
logger.info(f"User message: {request.message}")
|
| 342 |
+
|
| 343 |
+
# Get list of audio files
|
| 344 |
+
audio_files = [f for f in os.listdir(audio_dir) if f.endswith('.mp3')]
|
| 345 |
+
logger.info(f"Found {len(audio_files)} audio files: {audio_files}")
|
| 346 |
+
|
| 347 |
+
# Convert podcast_id to zero-based index and get the filename
|
| 348 |
+
try:
|
| 349 |
+
podcast_index = int(podcast_id) - 1
|
| 350 |
+
if podcast_index < 0 or podcast_index >= len(audio_files):
|
| 351 |
+
logger.error(f"Invalid podcast index: {podcast_index} (total files: {len(audio_files)})")
|
| 352 |
+
raise ValueError(f"Invalid podcast ID: {podcast_id}")
|
| 353 |
+
podcast_filename = audio_files[podcast_index]
|
| 354 |
+
logger.info(f"Found podcast file: {podcast_filename}")
|
| 355 |
+
except ValueError as e:
|
| 356 |
+
logger.error(f"Error converting podcast ID: {str(e)}")
|
| 357 |
+
raise HTTPException(status_code=404, detail=str(e))
|
| 358 |
+
|
| 359 |
+
# Extract topic from filename
|
| 360 |
+
topic = podcast_filename.split('-')[0].replace('_', ' ')
|
| 361 |
+
logger.info(f"Extracted topic: {topic}")
|
| 362 |
|
| 363 |
# Path to transcripts file
|
| 364 |
transcripts_file = os.path.join(os.path.dirname(__file__), "transcripts", "podcasts.json")
|
|
|
|
| 373 |
with open(transcripts_file, 'r') as f:
|
| 374 |
transcripts = json.load(f)
|
| 375 |
logger.info(f"Loaded {len(transcripts)} transcripts")
|
| 376 |
+
logger.info(f"Available topics: {[t.get('topic', 'NO_TOPIC') for t in transcripts]}")
|
| 377 |
except json.JSONDecodeError as e:
|
| 378 |
logger.error(f"Error decoding transcripts file: {str(e)}")
|
| 379 |
raise HTTPException(status_code=500, detail="Error reading transcripts file")
|
| 380 |
+
|
| 381 |
+
# Find matching transcript by topic
|
| 382 |
+
podcast_transcript = None
|
| 383 |
+
for transcript in transcripts:
|
| 384 |
+
transcript_topic = transcript.get("topic", "").lower().strip()
|
| 385 |
+
if transcript_topic == topic.lower().strip():
|
| 386 |
+
podcast_transcript = transcript.get("podcastScript")
|
| 387 |
+
logger.info(f"Found matching transcript for topic: {topic}")
|
| 388 |
+
break
|
| 389 |
+
|
| 390 |
+
if not podcast_transcript:
|
| 391 |
+
logger.error(f"No transcript found for topic: {topic}")
|
| 392 |
+
logger.error(f"Available topics: {[t.get('topic', 'NO_TOPIC') for t in transcripts]}")
|
| 393 |
+
raise HTTPException(status_code=404, detail=f"No transcript found for topic: {topic}")
|
| 394 |
+
|
| 395 |
+
logger.info(f"Found transcript for topic: {topic}")
|
| 396 |
+
logger.info(f"Full transcript length: {len(podcast_transcript)} characters")
|
| 397 |
+
logger.debug(f"Transcript preview: {podcast_transcript[:200]}...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 398 |
|
| 399 |
# Split text into chunks
|
| 400 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 401 |
+
chunk_size=1000,
|
| 402 |
+
chunk_overlap=100,
|
| 403 |
length_function=len,
|
| 404 |
+
separators=["\n\n", "\n", ". ", " ", ""]
|
| 405 |
)
|
| 406 |
|
| 407 |
# Use split_text for strings instead of split_documents
|
| 408 |
+
try:
|
| 409 |
+
logger.info("Starting text splitting process...")
|
| 410 |
+
chunks = text_splitter.split_text(podcast_transcript)
|
| 411 |
+
logger.info(f"Successfully split transcript into {len(chunks)} chunks")
|
| 412 |
+
|
| 413 |
+
# Log some sample chunks
|
| 414 |
+
logger.info("\nSample chunks:")
|
| 415 |
+
for i, chunk in enumerate(chunks[:3]): # Log first 3 chunks
|
| 416 |
+
logger.info(f"\nChunk {i+1}:")
|
| 417 |
+
logger.info("=" * 50)
|
| 418 |
+
logger.info(chunk)
|
| 419 |
+
logger.info("=" * 50)
|
| 420 |
+
|
| 421 |
+
if len(chunks) > 3:
|
| 422 |
+
logger.info(f"... and {len(chunks) - 3} more chunks")
|
| 423 |
+
|
| 424 |
+
except Exception as e:
|
| 425 |
+
logger.error(f"Error splitting text into chunks: {str(e)}")
|
| 426 |
+
raise HTTPException(status_code=500, detail=f"Error splitting text: {str(e)}")
|
| 427 |
|
| 428 |
if not chunks:
|
| 429 |
logger.error("No content chunks found in transcript")
|
| 430 |
raise HTTPException(status_code=404, detail="No content chunks found in transcript")
|
| 431 |
+
|
| 432 |
+
# Validate chunk sizes
|
| 433 |
+
chunk_sizes = [len(chunk) for chunk in chunks]
|
| 434 |
+
logger.info(f"\nChunk size statistics:")
|
| 435 |
+
logger.info(f"Min chunk size: {min(chunk_sizes)} characters")
|
| 436 |
+
logger.info(f"Max chunk size: {max(chunk_sizes)} characters")
|
| 437 |
+
logger.info(f"Average chunk size: {sum(chunk_sizes)/len(chunk_sizes):.2f} characters")
|
| 438 |
|
| 439 |
# Initialize embedding model
|
| 440 |
+
embedding_model = OpenAIEmbeddings(
|
| 441 |
+
model="text-embedding-3-small",
|
| 442 |
+
openai_api_key=openai_api_key
|
| 443 |
+
)
|
| 444 |
|
| 445 |
# Create a unique collection name for this podcast
|
| 446 |
collection_name = f"podcast_{podcast_id}"
|
|
|
|
| 456 |
|
| 457 |
# Configure the retriever with search parameters
|
| 458 |
qdrant_retriever = vectorstore.as_retriever(
|
| 459 |
+
search_type="similarity", # Use simple similarity search
|
| 460 |
+
search_kwargs={
|
| 461 |
+
"k": 8, # Increased from 5 to 8 chunks
|
| 462 |
+
"score_threshold": 0.05 # Lowered threshold further for more matches
|
| 463 |
+
}
|
| 464 |
)
|
| 465 |
|
| 466 |
base_rag_prompt_template = """\
|
| 467 |
You are a helpful podcast assistant. Answer the user's question based on the provided context from the podcast transcript.
|
| 468 |
+
If the context contains relevant information, use it to answer the question.
|
| 469 |
+
If you can't find relevant information in the context to answer the question, say "I don't have enough information to answer that question."
|
| 470 |
Keep your responses concise and focused on the question.
|
| 471 |
+
|
| 472 |
+
Important: Even if only part of the context is relevant to the question, use that part to provide a partial answer rather than saying there isn't enough information.
|
| 473 |
|
| 474 |
Context:
|
| 475 |
{context}
|
| 476 |
|
| 477 |
Question:
|
| 478 |
{question}
|
| 479 |
+
|
| 480 |
+
Answer the question using the information from the context above. If you find ANY relevant information, use it to provide at least a partial answer. Only say "I don't have enough information" if there is absolutely nothing relevant in the context.
|
| 481 |
"""
|
| 482 |
|
| 483 |
base_rag_prompt = ChatPromptTemplate.from_template(base_rag_prompt_template)
|
| 484 |
+
base_llm = ChatOpenAI(
|
| 485 |
+
model="gpt-3.5-turbo",
|
| 486 |
+
temperature=0.7,
|
| 487 |
+
openai_api_key=openai_api_key
|
| 488 |
+
)
|
| 489 |
|
| 490 |
# Create the RAG chain
|
| 491 |
def format_docs(docs):
|
| 492 |
+
formatted = "\n\n".join(doc.page_content for doc in docs)
|
| 493 |
+
logger.info(f"Formatted {len(docs)} documents into context of length: {len(formatted)}")
|
| 494 |
+
return formatted
|
| 495 |
|
| 496 |
# Add logging for the retrieved documents and final prompt
|
| 497 |
def get_context_and_log(input_dict):
|
| 498 |
+
try:
|
| 499 |
+
logger.info("\nAttempting to retrieve relevant documents...")
|
| 500 |
+
# Log the query being used
|
| 501 |
+
logger.info(f"Query: {input_dict['question']}")
|
| 502 |
+
|
| 503 |
+
# Use the newer invoke method instead of get_relevant_documents
|
| 504 |
+
retrieved_docs = qdrant_retriever.invoke(input_dict["question"])
|
| 505 |
+
logger.info(f"Successfully retrieved {len(retrieved_docs)} documents")
|
| 506 |
+
|
| 507 |
+
if not retrieved_docs:
|
| 508 |
+
logger.warning("No documents were retrieved!")
|
| 509 |
+
return {"context": "No relevant context found.", "question": input_dict["question"]}
|
| 510 |
+
|
| 511 |
+
# Log each retrieved document with its content and similarity score
|
| 512 |
+
total_content_length = 0
|
| 513 |
+
for i, doc in enumerate(retrieved_docs):
|
| 514 |
+
logger.info(f"\nDocument {i+1}:")
|
| 515 |
+
logger.info("=" * 50)
|
| 516 |
+
logger.info(f"Content: {doc.page_content}")
|
| 517 |
+
logger.info(f"Content Length: {len(doc.page_content)} characters")
|
| 518 |
+
logger.info(f"Metadata: {doc.metadata}")
|
| 519 |
+
logger.info("=" * 50)
|
| 520 |
+
total_content_length += len(doc.page_content)
|
| 521 |
+
|
| 522 |
+
context = format_docs(retrieved_docs)
|
| 523 |
+
|
| 524 |
+
# Log the final formatted context and question
|
| 525 |
+
logger.info("\nRetrieval Statistics:")
|
| 526 |
+
logger.info(f"Total documents retrieved: {len(retrieved_docs)}")
|
| 527 |
+
logger.info(f"Total content length: {total_content_length} characters")
|
| 528 |
+
logger.info(f"Average document length: {total_content_length/len(retrieved_docs):.2f} characters")
|
| 529 |
+
|
| 530 |
+
logger.info("\nFinal Context and Question:")
|
| 531 |
+
logger.info("=" * 50)
|
| 532 |
+
logger.info("Context:")
|
| 533 |
+
logger.info(f"{context}")
|
| 534 |
+
logger.info("-" * 50)
|
| 535 |
+
logger.info(f"Question: {input_dict['question']}")
|
| 536 |
+
logger.info("=" * 50)
|
| 537 |
+
|
| 538 |
+
if not context.strip():
|
| 539 |
+
logger.error("Warning: Empty context retrieved!")
|
| 540 |
+
return {"context": "No relevant context found.", "question": input_dict["question"]}
|
| 541 |
+
|
| 542 |
+
return {"context": context, "question": input_dict["question"]}
|
| 543 |
+
except Exception as e:
|
| 544 |
+
logger.error(f"Error in get_context_and_log: {str(e)}")
|
| 545 |
+
logger.error("Stack trace:", exc_info=True)
|
| 546 |
+
return {"context": "Error retrieving context.", "question": input_dict["question"]}
|
| 547 |
|
| 548 |
# Create the chain
|
| 549 |
chain = (
|
|
|
|
| 553 |
| base_llm
|
| 554 |
)
|
| 555 |
|
| 556 |
+
# Get response with enhanced logging
|
| 557 |
+
try:
|
| 558 |
+
logger.info("\nGenerating response...")
|
| 559 |
+
response = chain.invoke({"question": request.message})
|
| 560 |
+
logger.info("=" * 50)
|
| 561 |
+
logger.info("Final Response:")
|
| 562 |
+
logger.info(f"{response.content}")
|
| 563 |
+
logger.info("=" * 50)
|
| 564 |
+
|
| 565 |
+
return PodcastChatResponse(response=response.content)
|
| 566 |
+
except Exception as e:
|
| 567 |
+
logger.error(f"Error generating response: {str(e)}")
|
| 568 |
+
raise HTTPException(status_code=500, detail=f"Error generating response: {str(e)}")
|
| 569 |
|
| 570 |
except HTTPException:
|
| 571 |
raise
|
server/transcripts/podcasts.json
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
[]
|
|
|
|
|
|
server/utils.py
CHANGED
|
@@ -13,11 +13,15 @@ TRANSCRIPTS_FILE = os.path.join(TRANSCRIPTS_DIR, "podcasts.json")
|
|
| 13 |
|
| 14 |
def save_transcript(podcast_script: str, user_query: str) -> None:
|
| 15 |
"""Save podcast transcript to JSON file."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
# Create new transcript entry
|
| 17 |
transcript = {
|
| 18 |
"id": str(uuid.uuid4()),
|
| 19 |
"podcastScript": podcast_script,
|
| 20 |
-
"topic":
|
| 21 |
}
|
| 22 |
|
| 23 |
try:
|
|
@@ -34,13 +38,20 @@ def save_transcript(podcast_script: str, user_query: str) -> None:
|
|
| 34 |
else:
|
| 35 |
transcripts = []
|
| 36 |
|
| 37 |
-
#
|
| 38 |
-
transcripts
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
# Save updated transcripts
|
| 41 |
with open(TRANSCRIPTS_FILE, 'w') as f:
|
| 42 |
json.dump(transcripts, f, indent=2)
|
| 43 |
-
logger.info("Successfully saved transcript")
|
| 44 |
|
| 45 |
except Exception as e:
|
| 46 |
logger.error(f"Error saving transcript: {str(e)}")
|
|
|
|
| 13 |
|
| 14 |
def save_transcript(podcast_script: str, user_query: str) -> None:
|
| 15 |
"""Save podcast transcript to JSON file."""
|
| 16 |
+
# Process the topic to match filename format
|
| 17 |
+
topic = user_query.lower().strip().replace(" ", "_")
|
| 18 |
+
topic = topic.replace("?", "").replace("!", "").replace(".", "") # Remove punctuation
|
| 19 |
+
|
| 20 |
# Create new transcript entry
|
| 21 |
transcript = {
|
| 22 |
"id": str(uuid.uuid4()),
|
| 23 |
"podcastScript": podcast_script,
|
| 24 |
+
"topic": topic.replace("_", " ") # Store topic with spaces for matching
|
| 25 |
}
|
| 26 |
|
| 27 |
try:
|
|
|
|
| 38 |
else:
|
| 39 |
transcripts = []
|
| 40 |
|
| 41 |
+
# Check if transcript for this topic already exists
|
| 42 |
+
for i, existing in enumerate(transcripts):
|
| 43 |
+
if existing.get("topic") == transcript["topic"]:
|
| 44 |
+
# Update existing transcript
|
| 45 |
+
transcripts[i] = transcript
|
| 46 |
+
break
|
| 47 |
+
else:
|
| 48 |
+
# Append new transcript if no existing one was found
|
| 49 |
+
transcripts.append(transcript)
|
| 50 |
|
| 51 |
# Save updated transcripts
|
| 52 |
with open(TRANSCRIPTS_FILE, 'w') as f:
|
| 53 |
json.dump(transcripts, f, indent=2)
|
| 54 |
+
logger.info(f"Successfully saved transcript for topic: {transcript['topic']}")
|
| 55 |
|
| 56 |
except Exception as e:
|
| 57 |
logger.error(f"Error saving transcript: {str(e)}")
|