ChienChung committed on
Commit
d3ae098
·
verified ·
1 Parent(s): 1c79352

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +203 -334
app.py CHANGED
@@ -1232,401 +1232,270 @@ def multi_agent_chat_advanced(query: str, file=None) -> str:
1232
  except Exception as e:
1233
  return f"Multi-Agent Error: {e}"
1234
 
1235
- # Tab 6
1236
- # LangGraph node functions
1237
- # Initialize the embedding model
1238
  embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
1239
 
1240
- # Intent embedding classification (supports file names)
1241
  INTENT_LABELS = {
1242
- "DocQA": ["document", "file", "paper", "cb", "proposal", "project"],
1243
- "Summarise": ["summarise", "summary", "abstract", "key points", "overview", "main points"],
1244
- "General": ["who are you", "tell me something", "what can you do", "fun fact"],
1245
  }
1246
 
1247
- # AutoGen Multi-Agent Collaboration Logic
1248
  def detect_intent_embedding(query, file_names=[]):
 
 
 
 
1249
  query_emb = embedding_model.encode(query, normalize_embeddings=True)
1250
- best_label = None
1251
- best_score = -1
1252
- all_phrases = INTENT_LABELS.copy()
1253
  if file_names:
1254
- all_phrases["DocQA"] += [name.lower() for name in file_names]
1255
- for label, examples in all_phrases.items():
1256
- for example in examples:
1257
- example_emb = embedding_model.encode(example, normalize_embeddings=True)
1258
- score = float(query_emb @ example_emb.T)
 
1259
  if score > best_score:
1260
- best_score = score
1261
- best_label = label
1262
- return best_label if best_label else "General"
1263
-
1264
- def autogen_multi_document_analysis(query: str, docs: list, file_names: list) -> str:
1265
- try:
1266
- # Create a temporary working directory
1267
- temp_dir = tempfile.mkdtemp(dir="/tmp")
1268
- os.environ["OPENAI_CACHE_DIR"] = temp_dir
1269
-
1270
- # Set AutoGen's working directory
1271
- os.environ["AUTOGEN_CACHE_PATH"] = temp_dir
1272
- os.environ["AUTOGEN_CACHEDIR"] = temp_dir
1273
- os.environ["OPENAI_CACHE_PATH"] = temp_dir
1274
-
1275
- # Force AutoGen to use our temporary directory instead of ./.cache
1276
- if hasattr(autogen, "set_cache_dir"):
1277
- autogen.set_cache_dir(temp_dir)
1278
-
1279
- # Prepare document context
1280
- context = "\n\n".join(
1281
- f"Document {name}:\n{doc[:2000]}..."
1282
- for name, doc in zip(file_names, docs)
1283
- )
1284
-
1285
- # Configure LLM
1286
- config_list = [{
1287
- "model": "gpt-4o-mini",
1288
- "api_key": openai_api_key
1289
- }]
1290
-
1291
- # Base configuration (without any cache-related parameters)
1292
- llm_config = {
1293
- "config_list": config_list,
1294
- "temperature": 0
1295
- }
1296
-
1297
- # Switch to temporary directory before AutoGen processing
1298
- original_dir = os.getcwd()
1299
- os.chdir(temp_dir)
1300
-
1301
- try:
1302
- # AutoGen processing code
1303
- user_proxy = UserProxyAgent(
1304
- name="User",
1305
- system_message="A user seeking information from multiple documents.",
1306
- human_input_mode="NEVER",
1307
- code_execution_config={"use_docker": False},
1308
- llm_config=llm_config
1309
- )
1310
-
1311
- # Define document analysis expert
1312
- doc_analyzer = AssistantAgent(
1313
- name="DocumentAnalyzer",
1314
- system_message="""You are an expert at analyzing and comparing documents. Focus on:
1315
- 1. Key similarities and differences
1316
- 2. Main themes and topics
1317
- 3. Relationships between documents
1318
- 4. Evidence-based analysis""",
1319
- llm_config=llm_config
1320
- )
1321
-
1322
- # Define Q&A expert
1323
- qa_expert = AssistantAgent(
1324
- name="QAExpert",
1325
- system_message="""You are an expert at extracting specific information. Focus on:
1326
- 1. Finding relevant details
1327
- 2. Answering specific questions
1328
- 3. Cross-referencing information
1329
- 4. Providing evidence""",
1330
- llm_config=llm_config
1331
- )
1332
-
1333
- # Define summarisation expert
1334
- summarizer = AssistantAgent(
1335
- name="Summarizer",
1336
- system_message="""You are an expert at summarizing content. Focus on:
1337
- 1. Key points and findings
1338
- 2. Important relationships
1339
- 3. Critical conclusions
1340
- 4. Comprehensive overview""",
1341
- llm_config=llm_config
1342
- )
1343
-
1344
- # Create group chat
1345
- groupchat = GroupChat(
1346
- agents=[user_proxy, doc_analyzer, qa_expert, summarizer],
1347
- messages=[],
1348
- max_round=5
1349
- )
1350
 
1351
- # Create manager
1352
- manager = GroupChatManager(
1353
- groupchat=groupchat,
1354
- llm_config=llm_config
1355
- )
1356
-
1357
- # Prepare task prompt
1358
- task_prompt = f"""Analyze these documents and answer the query:
1359
-
1360
- Query: {query}
1361
-
1362
- Documents Context:
1363
- {context}
1364
-
1365
- Requirements:
1366
- 1. Provide a direct and clear answer
1367
- 2. Support all claims with evidence from the documents
1368
- 3. Consider relationships between all documents
1369
- 4. If comparing, analyze all relevant aspects
1370
- 5. If summarizing, cover all important points
1371
- 6. If looking for specific content, search thoroughly
1372
- 7. If analyzing relationships, consider all connections
1373
-
1374
- Please provide a comprehensive and well-structured answer."""
1375
-
1376
- # Execute the group discussion
1377
- user_proxy.initiate_chat(manager, message=task_prompt)
1378
- return user_proxy.last_message()["content"]
1379
- finally:
1380
- # After processing, change back to the original directory
1381
- os.chdir(original_dir)
1382
-
1383
- return result
1384
 
1385
- except Exception as e:
1386
- print(f"ERROR in AutoGen processing: {str(e)}")
1387
- return f"Error analyzing documents: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1388
 
 
 
 
 
 
 
1389
 
 
 
1390
 
1391
- def decide_next(state):
1392
- query = state.get("query", "")
1393
- file_names = state.get("file_names", [])
1394
- label = detect_intent_embedding(query, file_names)
1395
- return label
1396
 
 
 
1397
 
1398
- # === LangGraph Node Functions ===
 
 
 
 
 
 
1399
 
1400
  def general_run(state):
1401
- """Use direct LLM response instead of General Agent."""
1402
- try:
1403
- prompt = f"""You are a helpful AI assistant. Please answer the following question:
1404
- {state["query"]}
1405
-
1406
- Provide a clear and informative answer."""
1407
-
1408
- response = llm_gpt4.invoke(prompt)
1409
- answer = response.content if hasattr(response, 'content') else str(response)
1410
- return {"answer": answer}
1411
- except Exception as e:
1412
- print(f"ERROR in general_run: {str(e)}")
1413
- return {"answer": "I apologize, but I'm having trouble processing your request."}
1414
 
1415
  def docqa_run(state):
1416
- """Document Q&A processing."""
1417
- try:
1418
- # If a retriever exists, use it to get relevant documents; otherwise, use provided docs
1419
- if "retriever" in state:
1420
- relevant_docs = state["retriever"].get_relevant_documents(state["query"])
1421
- context = "\n".join(d.page_content for d in relevant_docs)
1422
- else:
1423
- context = "\n".join(state["docs"])
1424
-
1425
- prompt = f"""Based on the following context, please answer the question:
1426
- Question: {state["query"]}
1427
-
1428
- Context:
1429
- {context[:3000]}
1430
-
1431
- Provide a detailed and accurate answer based on the context."""
1432
-
1433
- response = llm_gpt4.invoke(prompt)
1434
- return {"answer": response.content if hasattr(response, 'content') else str(response)}
1435
- except Exception as e:
1436
- print(f"ERROR in docqa_run: {str(e)}")
1437
- return general_run(state)
1438
-
1439
- def summariser_run(state):
1440
- """Document summarisation processing."""
1441
- try:
1442
  context = "\n".join(state["docs"])
1443
- prompt = f"""Please provide a comprehensive summary of the following document:
1444
- {context[:3000]}
1445
-
1446
- Focus on:
1447
- 1. Main topics and key points
1448
- 2. Important findings or conclusions
1449
- 3. Significant details"""
1450
-
1451
- response = llm_gpt4.invoke(prompt)
1452
- return {"summary": response.content if hasattr(response, 'content') else str(response)}
1453
- except Exception as e:
1454
- print(f"ERROR in summariser_run: {str(e)}")
1455
- return {"summary": "Error generating summary."}
1456
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1457
 
1458
  def get_file_path_tab6(file):
1459
  if isinstance(file, str):
1460
- print("DEBUG: File is a string:", file)
1461
  if os.path.exists(file):
1462
- print("DEBUG: File exists:", file)
1463
  return file
1464
  else:
1465
- print("DEBUG: File does not exist:", file)
1466
  return None
1467
  elif isinstance(file, dict):
1468
- print("DEBUG: File is a dict:", file)
1469
  data = file.get("data")
1470
  name = file.get("name")
1471
- print("DEBUG: Data:", data, "Name:", name)
1472
  if data:
1473
  if isinstance(data, str) and os.path.exists(data):
1474
- print("DEBUG: Data is a valid file path:", data)
1475
  return data
1476
  else:
1477
  temp_dir = mkdtemp()
1478
  file_path = os.path.join(temp_dir, name if name else "uploaded_file")
1479
- print("DEBUG: Writing data to temporary file:", file_path)
1480
  with open(file_path, "wb") as f:
1481
  if isinstance(data, str):
1482
  f.write(data.encode("utf-8"))
1483
  else:
1484
  f.write(data)
1485
- if os.path.exists(file_path):
1486
- print("DEBUG: Temporary file created:", file_path)
1487
- return file_path
1488
- else:
1489
- print("ERROR: Temporary file not created:", file_path)
1490
- return None
1491
  else:
1492
- print("DEBUG: No data in dict, returning None")
1493
  return None
1494
  elif hasattr(file, "save"):
1495
- print("DEBUG: File has save attribute")
1496
  temp_dir = mkdtemp()
1497
  file_path = os.path.join(temp_dir, file.name)
1498
  file.save(file_path)
1499
- if os.path.exists(file_path):
1500
- print("DEBUG: File saved to:", file_path)
1501
- return file_path
1502
- else:
1503
- print("ERROR: File not saved properly:", file_path)
1504
- return None
1505
  else:
1506
- print("DEBUG: File type unrecognized")
1507
- if hasattr(file, "name"):
1508
- if os.path.exists(file.name):
1509
- return file.name
1510
  return None
1511
 
1512
- @traceable(name="multi_doc")
1513
  def langgraph_tab6_main(query: str, file=None):
 
 
 
 
 
 
 
 
 
 
1514
  try:
1515
- print(f"DEBUG: Starting processing with query: {query}")
1516
-
1517
- # If no file is uploaded, directly use general_run
1518
  if not file:
1519
  return general_run({"query": query})["answer"]
1520
-
1521
- # Process list of files
1522
  files = file if isinstance(file, list) else [file]
1523
- all_docs = []
1524
- file_names = []
1525
- docs_by_file = []
1526
-
1527
- # Process each uploaded file
1528
  for f in files:
1529
- try:
1530
- path = get_file_path_tab6(f)
1531
- if not path:
1532
- continue
1533
-
1534
- file_names.append(os.path.basename(path))
1535
-
1536
- # Choose loader based on file type
1537
- if path.lower().endswith('.pdf'):
1538
- loader = PyPDFLoader(path)
1539
- elif path.lower().endswith('.docx'):
1540
- loader = UnstructuredWordDocumentLoader(path)
1541
- else:
1542
- loader = TextLoader(path)
1543
-
1544
- docs = loader.load()
1545
- if docs:
1546
- text = "\n".join(doc.page_content for doc in docs if hasattr(doc, 'page_content'))
1547
- docs_by_file.append(text)
1548
- all_docs.extend(docs)
1549
- except Exception as e:
1550
- print(f"ERROR processing file: {str(e)}")
1551
- continue
1552
-
1553
- if not docs_by_file:
1554
  return general_run({"query": query})["answer"]
1555
 
1556
- # Build the retriever using Pinecone instead of FAISS, with fixed index name and namespace "Rag_Docs"
1557
- try:
1558
- import pinecone
1559
- # Initialize Pinecone (ensure the environment variables PINECONE_API_KEY and PINECONE_ENVIRONMENT are set)
1560
- pinecone.init(
1561
- api_key=os.getenv("PINECONE_API_KEY"),
1562
- environment=os.getenv("PINECONE_ENVIRONMENT")
1563
- )
1564
-
1565
- # Use fixed index name "Rag_Docs" as configured in your Pinecone account
1566
- index_name = "Rag_Docs"
1567
- # Check if the index exists; if not, create it with the appropriate dimension (768 for all-mpnet-base-v2)
1568
- if index_name not in pinecone.list_indexes():
1569
- pinecone.create_index(
1570
- name=index_name,
1571
- dimension=768, # Ensure the embedding model dimension matches; for all-mpnet-base-v2, it is 768
1572
- metric="cosine"
1573
- )
1574
-
1575
- # Split documents into chunks for finer retrieval
1576
- chunks = RecursiveCharacterTextSplitter(
1577
- chunk_size=500,
1578
- chunk_overlap=50
1579
- ).split_documents(all_docs)
1580
-
1581
- # Create or update the Pinecone index from the document chunks using a fixed namespace "Rag_Docs"
1582
- vectorstore = Pinecone.from_documents(
1583
- documents=chunks,
1584
- embedding=embeddings,
1585
- index_name=index_name,
1586
- namespace="Rag_Docs" # 固定使用 "Rag_Docs" 作為 namespace,與你的 Pinecone 介面設定一致
1587
- )
1588
-
1589
- # Create the retriever from the vector store
1590
- retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
1591
-
1592
- global session_retriever, session_qa_chain
1593
- session_retriever = retriever
1594
- session_qa_chain = ConversationalRetrievalChain.from_llm(
1595
- llm=llm_gpt4,
1596
- retriever=retriever,
1597
- memory=ConversationBufferMemory(
1598
- memory_key="chat_history",
1599
- return_messages=True
1600
- ),
1601
- )
1602
- except Exception as e:
1603
- print(f"ERROR setting up Pinecone retriever: {str(e)}")
1604
- retriever = None
1605
-
1606
- # If the query is a multi-document query or a complex query, use AutoGen collaboration
1607
- if len(docs_by_file) > 1 or "compare" in query.lower() or "relation" in query.lower():
1608
- return autogen_multi_document_analysis(query, docs_by_file, file_names)
1609
-
1610
- # Use LangGraph to process single-document queries
1611
- state = {
1612
- "query": query,
1613
- "file_names": file_names,
1614
- "docs": docs_by_file,
1615
- "retriever": retriever
1616
- }
1617
-
1618
- # Choose processing method based on query intent
1619
- intent = detect_intent_embedding(query, file_names)
1620
- if intent == "Summarise":
1621
- return summariser_run(state)["summary"]
1622
- elif intent == "DocQA":
1623
- return docqa_run(state)["answer"]
1624
- else:
1625
- return general_run(state)["answer"]
1626
-
1627
  except Exception as e:
1628
- print(f"ERROR in main function: {str(e)}")
1629
- return f"I apologize, but I encountered an error: {str(e)}"
1630
 
1631
  # Gradio Interface Settings
1632
  demo_description = """
 
1232
  except Exception as e:
1233
  return f"Multi-Agent Error: {e}"
1234
 
1235
# === Tab 6: Smart Multi-Document QA (LangGraph + AutoGen + Pinecone Retriever) ===

# Sentence-transformer model used to embed both the user query and the
# intent example phrases below.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Example phrases for each intent label. detect_intent_embedding() returns the
# label owning the phrase that scores highest against the query embedding.
INTENT_LABELS = dict(
    DocQA=["document", "file", "paper", "proposal", "project"],
    Summarise=[
        "summarise",
        "summary",
        "abstract",
        "key points",
        "overview",
        "main points",
    ],
    General=["who are you", "tell me something", "what can you do", "fun fact"],
)
1246
 
 
1247
  def detect_intent_embedding(query, file_names=[]):
1248
+ """
1249
+ Compute embedding of the user query, compare against each intent's example embeddings,
1250
+ and return the label with highest cosine similarity.
1251
+ """
1252
  query_emb = embedding_model.encode(query, normalize_embeddings=True)
1253
+ best_label, best_score = None, -1.0
1254
+ # include file names as additional examples for DocQA
1255
+ phrases = { **INTENT_LABELS }
1256
  if file_names:
1257
+ phrases["DocQA"] += [name.lower() for name in file_names]
1258
+ # find highest scoring intent
1259
+ for label, examples in phrases.items():
1260
+ for ex in examples:
1261
+ ex_emb = embedding_model.encode(ex, normalize_embeddings=True)
1262
+ score = float(query_emb @ ex_emb.T)
1263
  if score > best_score:
1264
+ best_label, best_score = label, score
1265
+ return best_label or "General"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1266
 
1267
def decide_next(state):
    """
    Routing callback for the LangGraph "Router" node.

    Returns the intent label computed by detect_intent_embedding() from the
    state's "query" and, when present, its "file_names".
    """
    return detect_intent_embedding(state["query"], state.get("file_names", []))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1273
 
1274
def autogen_multi_document_analysis(query: str, docs: list, file_names: list) -> str:
    """
    Answer a query across several documents with an AutoGen group chat.

    A user proxy and three assistant agents (document analyzer, Q&A expert,
    summarizer) are placed in a GroupChat capped at 5 rounds. The user proxy
    opens the chat with a prompt containing the query and the first 2000
    characters of each document.

    Args:
        query: The user's question.
        docs: Full text of each document.
        file_names: Display names, paired positionally with ``docs``.

    Returns:
        The "content" field of the user proxy's last message.
    """
    # Point the cache-related environment variables at a fresh temp directory.
    # NOTE(review): presumably this keeps AutoGen/OpenAI caches out of the
    # working directory; confirm which of these variables are actually read.
    cache_dir = tempfile.mkdtemp(dir="/tmp")
    for env_key in (
        "OPENAI_CACHE_DIR",
        "AUTOGEN_CACHE_PATH",
        "AUTOGEN_CACHEDIR",
        "OPENAI_CACHE_PATH",
    ):
        os.environ[env_key] = cache_dir
    if hasattr(autogen, "set_cache_dir"):
        autogen.set_cache_dir(cache_dir)

    # One truncated section per (name, document) pair.
    sections = [
        f"Document {name}:\n{doc[:2000]}..."
        for name, doc in zip(file_names, docs)
    ]
    context = "\n\n".join(sections)

    llm_config = {
        "config_list": [{"model": "gpt-4o-mini", "api_key": openai_api_key}],
        "temperature": 0,
    }

    user_proxy = UserProxyAgent(
        name="User",
        system_message="User seeking cross-document analysis.",
        human_input_mode="NEVER",
        code_execution_config={"use_docker": False},
        llm_config=llm_config,
    )

    # Specialist agents, created in the order they join the group chat.
    specialist_roles = (
        ("DocumentAnalyzer", "Expert on comparing document content and structure."),
        ("QAExpert", "Expert at extracting precise answers from text."),
        ("Summarizer", "Expert at generating concise summaries."),
    )
    specialists = [
        AssistantAgent(name=role, system_message=instructions, llm_config=llm_config)
        for role, instructions in specialist_roles
    ]

    group = GroupChat(agents=[user_proxy, *specialists], messages=[], max_round=5)
    manager = GroupChatManager(groupchat=group, llm_config=llm_config)

    prompt = f"""Analyze these documents and answer the query:

Query: {query}

Documents Context:
{context}

Requirements:
1. Provide a direct and clear answer
2. Support claims with evidence from the documents
3. Consider relationships and comparisons where relevant
Please produce a well-structured answer."""

    user_proxy.initiate_chat(manager, message=prompt)
    final_message = user_proxy.last_message()
    return final_message["content"]
1341
 
1342
  def general_run(state):
1343
+ """
1344
+ LangGraph 'General' node: fallback that asks GPT-4o-mini directly for general queries.
1345
+ """
1346
+ response = llm_gpt4.invoke(f"You are a helpful assistant. Answer concisely:\n{state['query']}")
1347
+ return {"answer": getattr(response, "content", str(response))}
 
 
 
 
 
 
 
 
1348
 
1349
  def docqa_run(state):
1350
+ """
1351
+ LangGraph 'DocQA' node: retrieve from the provided retriever and answer via LLM.
1352
+ """
1353
+ retriever = state.get("retriever")
1354
+ if retriever:
1355
+ docs = retriever.get_relevant_documents(state["query"])
1356
+ context = "\n".join(d.page_content for d in docs)
1357
+ else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1358
  context = "\n".join(state["docs"])
1359
+ prompt = f"Based on the following context, answer the question:\n\nContext:\n{context[:3000]}\n\nQuestion: {state['query']}"
1360
+ response = llm_gpt4.invoke(prompt)
1361
+ return {"answer": getattr(response, "content", str(response))}
 
 
 
 
 
 
 
 
 
 
1362
 
1363
def summariser_run(state):
    """
    LangGraph 'Summarise' node: summarise the joined document texts.

    The texts under the state's "docs" key are joined with newlines and the
    first 3000 characters are sent to ``llm_gpt4``.

    Returns a dict with a single "summary" key.
    """
    combined = "\n".join(state["docs"])
    excerpt = combined[:3000]
    reply = llm_gpt4.invoke(f"Please summarise the following content:\n\n{excerpt}")
    fallback_text = str(reply)
    return {"summary": getattr(reply, "content", fallback_text)}
1371
+
1372
def build_langgraph_pipeline():
    """
    Build and compile the Tab 6 state graph.

    Layout: a pass-through "Router" entry node whose conditional edges
    (decided by decide_next) lead to one of "DocQA", "Summarise" or
    "General". Each of those three is registered as a finish point.
    """
    def pass_through(state):
        # The router node does no work itself; routing happens on its edges.
        return state

    handlers = {
        "DocQA": docqa_run,
        "Summarise": summariser_run,
        "General": general_run,
    }

    workflow = StateGraph(dict)
    workflow.add_node("Router", pass_through)
    for node_name, handler in handlers.items():
        workflow.add_node(node_name, handler)

    workflow.set_entry_point("Router")
    # Each intent label maps to the node of the same name.
    workflow.add_conditional_edges(
        "Router", decide_next, {label: label for label in handlers}
    )
    for node_name in handlers:
        workflow.set_finish_point(node_name)

    return workflow.compile()
1391
 
1392
  def get_file_path_tab6(file):
1393
  if isinstance(file, str):
 
1394
  if os.path.exists(file):
 
1395
  return file
1396
  else:
 
1397
  return None
1398
  elif isinstance(file, dict):
 
1399
  data = file.get("data")
1400
  name = file.get("name")
 
1401
  if data:
1402
  if isinstance(data, str) and os.path.exists(data):
 
1403
  return data
1404
  else:
1405
  temp_dir = mkdtemp()
1406
  file_path = os.path.join(temp_dir, name if name else "uploaded_file")
 
1407
  with open(file_path, "wb") as f:
1408
  if isinstance(data, str):
1409
  f.write(data.encode("utf-8"))
1410
  else:
1411
  f.write(data)
1412
+ return file_path if os.path.exists(file_path) else None
 
 
 
 
 
1413
  else:
 
1414
  return None
1415
  elif hasattr(file, "save"):
 
1416
  temp_dir = mkdtemp()
1417
  file_path = os.path.join(temp_dir, file.name)
1418
  file.save(file_path)
1419
+ return file_path if os.path.exists(file_path) else None
 
 
 
 
 
1420
  else:
1421
+ if hasattr(file, "name") and os.path.exists(file.name):
1422
+ return file.name
 
 
1423
  return None
1424
 
1425
def _load_tab6_documents(files):
    """Load every readable upload; return (all_docs, file_names, docs_text)."""
    all_docs, file_names, docs_text = [], [], []
    for f in files:
        try:
            path = get_file_path_tab6(f)
            if not path:
                continue
            # Match the extension case-insensitively so "REPORT.PDF" is not
            # handed to the plain-text loader.
            lowered_path = path.lower()
            if lowered_path.endswith(".pdf"):
                loader = PyPDFLoader(path)
            elif lowered_path.endswith(".docx"):
                loader = UnstructuredWordDocumentLoader(path)
            else:
                loader = TextLoader(path)
            docs = loader.load()
        except Exception as e:
            # One unreadable file should not abort the whole request.
            print(f"ERROR processing file: {e}")
            continue
        if not docs:
            continue
        # Record the name only once the file produced content, so file_names
        # and docs_text stay index-aligned for the zip() in
        # autogen_multi_document_analysis.
        file_names.append(os.path.basename(path))
        docs_text.append("\n".join(d.page_content for d in docs))
        all_docs.extend(docs)
    return all_docs, file_names, docs_text


def _build_tab6_retriever(all_docs):
    """Chunk the documents, upsert them into Pinecone and return a retriever."""
    global session_retriever, session_qa_chain

    import pinecone
    pinecone.init(
        api_key=os.getenv("PINECONE_API_KEY"),
        environment=os.getenv("PINECONE_ENVIRONMENT"),
    )
    # NOTE(review): Pinecone documents index names as lowercase alphanumerics
    # and hyphens only; confirm "Rag_Docs" is accepted by the target account.
    index_name = "Rag_Docs"
    if index_name not in pinecone.list_indexes():
        # NOTE(review): dimension must equal the output size of `embeddings`
        # (defined elsewhere in the file); confirm it really is 768.
        pinecone.create_index(name=index_name, dimension=768, metric="cosine")

    chunks = RecursiveCharacterTextSplitter(
        chunk_size=500, chunk_overlap=50
    ).split_documents(all_docs)
    vectorstore = Pinecone.from_documents(
        documents=chunks,
        embedding=embeddings,
        index_name=index_name,
        namespace="Rag_Docs",
    )
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

    # Publish the retriever and a conversational chain as module globals.
    session_retriever = retriever
    session_qa_chain = ConversationalRetrievalChain.from_llm(
        llm=llm_gpt4,
        retriever=retriever,
        memory=ConversationBufferMemory(memory_key="chat_history", return_messages=True),
    )
    return retriever


@traceable(name="multi_doc")
def langgraph_tab6_main(query: str, file=None):
    """
    Main entrypoint for Tab 6.

    1. With no file, or no loadable content, answer via general_run.
    2. Load the uploaded file(s), skipping any that fail to load.
    3. Try to index the chunks in Pinecone (index and namespace "Rag_Docs")
       and build a retriever; on failure continue without one.
    4. For more than one document, or a query containing "compare" or
       "relation", delegate to autogen_multi_document_analysis.
    5. Otherwise run the LangGraph pipeline and return its "answer" or
       "summary".

    Args:
        query: The user's question.
        file: A single upload, a list of uploads, or None.

    Returns:
        The answer text, or an apology string containing the error message
        when an unexpected exception occurs.
    """
    try:
        if not file:
            return general_run({"query": query})["answer"]

        files = file if isinstance(file, list) else [file]
        all_docs, file_names, docs_text = _load_tab6_documents(files)
        if not docs_text:
            return general_run({"query": query})["answer"]

        # A vector-store outage should not block answering: docqa_run falls
        # back to the raw document text when the retriever is None.
        try:
            retriever = _build_tab6_retriever(all_docs)
        except Exception as e:
            print(f"ERROR setting up Pinecone retriever: {e}")
            retriever = None

        # Multi-document or comparison requests go to the AutoGen group chat.
        lowered_query = query.lower()
        if len(docs_text) > 1 or "compare" in lowered_query or "relation" in lowered_query:
            return autogen_multi_document_analysis(query, docs_text, file_names)

        # Otherwise route the single document through the LangGraph pipeline.
        state = {
            "query": query,
            "file_names": file_names,
            "docs": docs_text,
            "retriever": retriever,
        }
        out = build_langgraph_pipeline().invoke(state)
        return out.get("answer") or out.get("summary")

    except Exception as e:
        print(f"ERROR in Tab6 main: {e}")
        return f"Sorry, an error occurred: {e}"
1499
 
1500
  # Gradio Interface Settings
1501
  demo_description = """