ChienChung committed on
Commit
e8a83a0
·
verified ·
1 Parent(s): a3bc85b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -5
app.py CHANGED
@@ -8,6 +8,7 @@ import requests
8
  import transformers
9
  import chardet
10
  import deepeval
 
11
  from transformers import AutoModelForCausalLM, AutoTokenizer
12
  from transformers.models.llama.configuration_llama import LlamaConfig
13
  from huggingface_hub import hf_hub_download
@@ -1388,18 +1389,23 @@ def get_file_path_tab6(file):
1388
  return None
1389
 
1390
  # 修改後的 Tab6 主執行函式
 
1391
  def langgraph_tab6_main(query: str, file=None):
1392
  try:
1393
  # 以多檔案模式處理上傳的檔案
1394
  files = file if isinstance(file, list) else [file] if file else []
1395
- all_docs, file_names = [], []
 
 
 
1396
  for f in files:
1397
- path = get_file_path(f) # 使用有的 get_file_path
1398
  if not path:
1399
  print("get_file_path returned None for file:", f)
1400
  continue
1401
  file_names.append(os.path.basename(path))
1402
  print("Processing file:", path)
 
1403
  if path.lower().endswith(".pdf"):
1404
  loader = PyPDFLoader(path)
1405
  elif path.lower().endswith(".docx"):
@@ -1408,8 +1414,29 @@ def langgraph_tab6_main(query: str, file=None):
1408
  loader = TextLoader(path)
1409
  docs = loader.load()
1410
  print("Docs loaded from", path, ":", docs)
 
 
 
 
 
 
1411
  all_docs.extend(docs)
1412
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1413
  if not all_docs:
1414
  print("No document content read. file_names:", file_names)
1415
  retriever = None
@@ -1417,10 +1444,8 @@ def langgraph_tab6_main(query: str, file=None):
1417
  chunks = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50).split_documents(all_docs)
1418
  db = FAISS.from_documents(chunks, embeddings)
1419
  retriever = db.as_retriever()
1420
- # 將建立好的 retriever 指派到全域變數 session_retriever
1421
  global session_retriever
1422
  session_retriever = retriever
1423
- # 同時建立 Document QA 的 ConversationalRetrievalChain,供 uploaded_qa_tool_func 使用
1424
  global session_qa_chain
1425
  session_qa_chain = ConversationalRetrievalChain.from_llm(
1426
  llm=llm_gpt4,
@@ -1440,7 +1465,6 @@ def langgraph_tab6_main(query: str, file=None):
1440
  return "No answer."
1441
  except Exception as e:
1442
  return f"[Tab6 Error] {e}"
1443
-
1444
 
1445
  # Gradio Interface Settings
1446
  demo_description = """
 
8
  import transformers
9
  import chardet
10
  import deepeval
11
+ import difflib
12
  from transformers import AutoModelForCausalLM, AutoTokenizer
13
  from transformers.models.llama.configuration_llama import LlamaConfig
14
  from huggingface_hub import hf_hub_download
 
1389
  return None
1390
 
1391
  # 修改後的 Tab6 主執行函式
1392
+
1393
  def langgraph_tab6_main(query: str, file=None):
1394
  try:
1395
  # 以多檔案模式處理上傳的檔案
1396
  files = file if isinstance(file, list) else [file] if file else []
1397
+ all_docs = [] # 用來合併所有檔案內文(後續用於建立檢索器)
1398
+ file_names = [] # 儲存每份檔案的檔名
1399
+ docs_by_file = [] # 儲存每份檔案的完整文字(分開保存)
1400
+
1401
  for f in files:
1402
+ path = get_file_path(f) # 使用有的 get_file_path
1403
  if not path:
1404
  print("get_file_path returned None for file:", f)
1405
  continue
1406
  file_names.append(os.path.basename(path))
1407
  print("Processing file:", path)
1408
+ # 根據副檔名選擇對應 Loader
1409
  if path.lower().endswith(".pdf"):
1410
  loader = PyPDFLoader(path)
1411
  elif path.lower().endswith(".docx"):
 
1414
  loader = TextLoader(path)
1415
  docs = loader.load()
1416
  print("Docs loaded from", path, ":", docs)
1417
+ # 將文件內容合併成單一文字(假設每個 doc 有 page_content 屬性,否則直接 join)
1418
+ if docs and hasattr(docs[0], "page_content"):
1419
+ text = "\n".join([doc.page_content for doc in docs])
1420
+ else:
1421
+ text = "\n".join(docs)
1422
+ docs_by_file.append(text)
1423
  all_docs.extend(docs)
1424
 
1425
+ # 如果上傳了兩份以上檔案,且查詢中包含 "differ"(例如 "difference", "different")
1426
+ if len(docs_by_file) >= 2 and "differ" in query.lower():
1427
+ diff = difflib.unified_diff(
1428
+ docs_by_file[0].splitlines(),
1429
+ docs_by_file[1].splitlines(),
1430
+ fromfile=file_names[0],
1431
+ tofile=file_names[1],
1432
+ lineterm=''
1433
+ )
1434
+ diff_text = "\n".join(list(diff))
1435
+ if not diff_text.strip():
1436
+ diff_text = "The two documents appear to be identical."
1437
+ return diff_text
1438
+
1439
+ # 否則,採用合併所有檔案內文的方式建立檢索器
1440
  if not all_docs:
1441
  print("No document content read. file_names:", file_names)
1442
  retriever = None
 
1444
  chunks = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50).split_documents(all_docs)
1445
  db = FAISS.from_documents(chunks, embeddings)
1446
  retriever = db.as_retriever()
 
1447
  global session_retriever
1448
  session_retriever = retriever
 
1449
  global session_qa_chain
1450
  session_qa_chain = ConversationalRetrievalChain.from_llm(
1451
  llm=llm_gpt4,
 
1465
  return "No answer."
1466
  except Exception as e:
1467
  return f"[Tab6 Error] {e}"
 
1468
 
1469
  # Gradio Interface Settings
1470
  demo_description = """