Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,6 +8,7 @@ import requests
|
|
| 8 |
import transformers
|
| 9 |
import chardet
|
| 10 |
import deepeval
|
|
|
|
| 11 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 12 |
from transformers.models.llama.configuration_llama import LlamaConfig
|
| 13 |
from huggingface_hub import hf_hub_download
|
|
@@ -1388,18 +1389,23 @@ def get_file_path_tab6(file):
|
|
| 1388 |
return None
|
| 1389 |
|
| 1390 |
# 修改後的 Tab6 主執行函式
|
|
|
|
| 1391 |
def langgraph_tab6_main(query: str, file=None):
|
| 1392 |
try:
|
| 1393 |
# 以多檔案模式處理上傳的檔案
|
| 1394 |
files = file if isinstance(file, list) else [file] if file else []
|
| 1395 |
-
all_docs
|
|
|
|
|
|
|
|
|
|
| 1396 |
for f in files:
|
| 1397 |
-
path = get_file_path(f) # 使用
|
| 1398 |
if not path:
|
| 1399 |
print("get_file_path returned None for file:", f)
|
| 1400 |
continue
|
| 1401 |
file_names.append(os.path.basename(path))
|
| 1402 |
print("Processing file:", path)
|
|
|
|
| 1403 |
if path.lower().endswith(".pdf"):
|
| 1404 |
loader = PyPDFLoader(path)
|
| 1405 |
elif path.lower().endswith(".docx"):
|
|
@@ -1408,8 +1414,29 @@ def langgraph_tab6_main(query: str, file=None):
|
|
| 1408 |
loader = TextLoader(path)
|
| 1409 |
docs = loader.load()
|
| 1410 |
print("Docs loaded from", path, ":", docs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1411 |
all_docs.extend(docs)
|
| 1412 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1413 |
if not all_docs:
|
| 1414 |
print("No document content read. file_names:", file_names)
|
| 1415 |
retriever = None
|
|
@@ -1417,10 +1444,8 @@ def langgraph_tab6_main(query: str, file=None):
|
|
| 1417 |
chunks = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50).split_documents(all_docs)
|
| 1418 |
db = FAISS.from_documents(chunks, embeddings)
|
| 1419 |
retriever = db.as_retriever()
|
| 1420 |
-
# 將建立好的 retriever 指派到全域變數 session_retriever
|
| 1421 |
global session_retriever
|
| 1422 |
session_retriever = retriever
|
| 1423 |
-
# 同時建立 Document QA 的 ConversationalRetrievalChain,供 uploaded_qa_tool_func 使用
|
| 1424 |
global session_qa_chain
|
| 1425 |
session_qa_chain = ConversationalRetrievalChain.from_llm(
|
| 1426 |
llm=llm_gpt4,
|
|
@@ -1440,7 +1465,6 @@ def langgraph_tab6_main(query: str, file=None):
|
|
| 1440 |
return "No answer."
|
| 1441 |
except Exception as e:
|
| 1442 |
return f"[Tab6 Error] {e}"
|
| 1443 |
-
|
| 1444 |
|
| 1445 |
# Gradio Interface Settings
|
| 1446 |
demo_description = """
|
|
|
|
| 8 |
import transformers
|
| 9 |
import chardet
|
| 10 |
import deepeval
|
| 11 |
+
import difflib
|
| 12 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 13 |
from transformers.models.llama.configuration_llama import LlamaConfig
|
| 14 |
from huggingface_hub import hf_hub_download
|
|
|
|
| 1389 |
return None
|
| 1390 |
|
| 1391 |
# 修改後的 Tab6 主執行函式
|
| 1392 |
+
|
| 1393 |
def langgraph_tab6_main(query: str, file=None):
|
| 1394 |
try:
|
| 1395 |
# 以多檔案模式處理上傳的檔案
|
| 1396 |
files = file if isinstance(file, list) else [file] if file else []
|
| 1397 |
+
all_docs = [] # 用來合併所有檔案內文(後續用於建立檢索器)
|
| 1398 |
+
file_names = [] # 儲存每份檔案的檔名
|
| 1399 |
+
docs_by_file = [] # 儲存每份檔案的完整文字(分開保存)
|
| 1400 |
+
|
| 1401 |
for f in files:
|
| 1402 |
+
path = get_file_path(f) # 使用現有的 get_file_path
|
| 1403 |
if not path:
|
| 1404 |
print("get_file_path returned None for file:", f)
|
| 1405 |
continue
|
| 1406 |
file_names.append(os.path.basename(path))
|
| 1407 |
print("Processing file:", path)
|
| 1408 |
+
# 根據副檔名選擇對應 Loader
|
| 1409 |
if path.lower().endswith(".pdf"):
|
| 1410 |
loader = PyPDFLoader(path)
|
| 1411 |
elif path.lower().endswith(".docx"):
|
|
|
|
| 1414 |
loader = TextLoader(path)
|
| 1415 |
docs = loader.load()
|
| 1416 |
print("Docs loaded from", path, ":", docs)
|
| 1417 |
+
# 將文件內容合併成單一文字(假設每個 doc 有 page_content 屬性,否則直接 join)
|
| 1418 |
+
if docs and hasattr(docs[0], "page_content"):
|
| 1419 |
+
text = "\n".join([doc.page_content for doc in docs])
|
| 1420 |
+
else:
|
| 1421 |
+
text = "\n".join(docs)
|
| 1422 |
+
docs_by_file.append(text)
|
| 1423 |
all_docs.extend(docs)
|
| 1424 |
|
| 1425 |
+
# 如果上傳了兩份以上檔案,且查詢中包含 "differ"(例如 "difference", "different")
|
| 1426 |
+
if len(docs_by_file) >= 2 and "differ" in query.lower():
|
| 1427 |
+
diff = difflib.unified_diff(
|
| 1428 |
+
docs_by_file[0].splitlines(),
|
| 1429 |
+
docs_by_file[1].splitlines(),
|
| 1430 |
+
fromfile=file_names[0],
|
| 1431 |
+
tofile=file_names[1],
|
| 1432 |
+
lineterm=''
|
| 1433 |
+
)
|
| 1434 |
+
diff_text = "\n".join(list(diff))
|
| 1435 |
+
if not diff_text.strip():
|
| 1436 |
+
diff_text = "The two documents appear to be identical."
|
| 1437 |
+
return diff_text
|
| 1438 |
+
|
| 1439 |
+
# 否則,採用合併所有檔案內文的方式建立檢索器
|
| 1440 |
if not all_docs:
|
| 1441 |
print("No document content read. file_names:", file_names)
|
| 1442 |
retriever = None
|
|
|
|
| 1444 |
chunks = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50).split_documents(all_docs)
|
| 1445 |
db = FAISS.from_documents(chunks, embeddings)
|
| 1446 |
retriever = db.as_retriever()
|
|
|
|
| 1447 |
global session_retriever
|
| 1448 |
session_retriever = retriever
|
|
|
|
| 1449 |
global session_qa_chain
|
| 1450 |
session_qa_chain = ConversationalRetrievalChain.from_llm(
|
| 1451 |
llm=llm_gpt4,
|
|
|
|
| 1465 |
return "No answer."
|
| 1466 |
except Exception as e:
|
| 1467 |
return f"[Tab6 Error] {e}"
|
|
|
|
| 1468 |
|
| 1469 |
# Gradio Interface Settings
|
| 1470 |
demo_description = """
|