Update app.py
Browse files
app.py
CHANGED
|
@@ -48,94 +48,96 @@ def process_pdf(pdf_path):
|
|
| 48 |
)
|
| 49 |
texts = text_splitter.split_documents(documents)
|
| 50 |
|
| 51 |
-
|
|
|
|
| 52 |
vectorstore = FAISS.from_documents(texts, embeddings)
|
| 53 |
return vectorstore
|
| 54 |
|
| 55 |
def postprocess_answer(answer):
|
|
|
|
| 56 |
replacements = {
|
| 57 |
-
"the context": "
|
| 58 |
-
"according to the document": "
|
| 59 |
-
"it is stated that": "
|
| 60 |
-
"the answer is": "
|
| 61 |
-
"based on the information": "
|
| 62 |
}
|
| 63 |
|
| 64 |
-
for eng,
|
| 65 |
-
answer = answer.replace(eng,
|
| 66 |
|
|
|
|
| 67 |
answer = answer.strip()
|
| 68 |
if answer and len(answer) > 0:
|
| 69 |
answer = answer[0].upper() + answer[1:]
|
| 70 |
|
|
|
|
| 71 |
if len(answer.split()) < 4:
|
| 72 |
-
answer = "
|
| 73 |
|
| 74 |
return answer
|
| 75 |
|
| 76 |
-
#
|
| 77 |
-
template = """
|
| 78 |
{context}
|
| 79 |
|
| 80 |
-
|
| 81 |
-
|
| 82 |
|
| 83 |
QA_PROMPT = PromptTemplate(
|
| 84 |
template=template,
|
| 85 |
-
input_variables=["context", "question"]
|
| 86 |
)
|
| 87 |
|
| 88 |
def main():
|
| 89 |
-
st.set_page_config(page_title="
|
| 90 |
-
st.title("
|
| 91 |
-
st.markdown("
|
| 92 |
|
| 93 |
-
uploaded_file = st.file_uploader("
|
| 94 |
|
| 95 |
if uploaded_file is not None:
|
| 96 |
with open("temp.pdf", "wb") as f:
|
| 97 |
f.write(uploaded_file.getbuffer())
|
| 98 |
|
| 99 |
-
with st.spinner("
|
| 100 |
vectorstore = process_pdf("temp.pdf")
|
| 101 |
|
| 102 |
llm = load_llm()
|
| 103 |
|
| 104 |
-
# Thêm tham số input_key để ánh xạ đúng tên biến
|
| 105 |
qa_chain = RetrievalQA.from_chain_type(
|
| 106 |
llm=llm,
|
| 107 |
chain_type="stuff",
|
| 108 |
retriever=vectorstore.as_retriever(search_kwargs={"k": 4}),
|
| 109 |
return_source_documents=True,
|
| 110 |
-
input_key="question",
|
| 111 |
chain_type_kwargs={
|
| 112 |
"prompt": QA_PROMPT,
|
| 113 |
"document_variable_name": "context"
|
| 114 |
}
|
| 115 |
)
|
| 116 |
|
| 117 |
-
query = st.text_input("
|
| 118 |
if query:
|
| 119 |
-
with st.spinner("
|
| 120 |
try:
|
| 121 |
-
|
| 122 |
-
result = qa_chain({"question": query}) # Đổi thành 'question'
|
| 123 |
raw_answer = result["result"]
|
| 124 |
answer = postprocess_answer(raw_answer)
|
| 125 |
|
| 126 |
-
st.markdown("###
|
| 127 |
st.success(answer)
|
| 128 |
|
| 129 |
-
with st.expander("
|
| 130 |
for i, doc in enumerate(result["source_documents"]):
|
| 131 |
-
st.markdown(f"
|
| 132 |
st.info(doc.page_content[:500] + "...")
|
| 133 |
|
| 134 |
except Exception as e:
|
| 135 |
-
st.error(f"
|
| 136 |
|
| 137 |
else:
|
| 138 |
-
st.info("
|
| 139 |
|
| 140 |
if __name__ == "__main__":
|
| 141 |
main()
|
|
|
|
| 48 |
)
|
| 49 |
texts = text_splitter.split_documents(documents)
|
| 50 |
|
| 51 |
+
# Sử dụng model embedding đa ngôn ngữ
|
| 52 |
+
embeddings = SentenceTransformerEmbeddings(model_name="paraphrase-multilingual-mpnet-base-v2")
|
| 53 |
vectorstore = FAISS.from_documents(texts, embeddings)
|
| 54 |
return vectorstore
|
| 55 |
|
| 56 |
def postprocess_answer(answer):
    """Clean up a raw LLM answer for display to a Japanese-speaking user.

    Steps, in order:

    1. Replace a fixed set of English filler phrases with Japanese
       equivalents (matched case-insensitively, so sentence-initial
       "The answer is ..." is handled as well as "... the answer is ...").
    2. Strip surrounding whitespace and upper-case the first character
       (a no-op for Japanese characters).
    3. If the answer is very short, prefix it with a notice that the
       information seems insufficient.

    Args:
        answer: The raw answer string produced by the QA chain.

    Returns:
        The cleaned answer string, possibly prefixed with the
        "information seems insufficient" notice.
    """
    import re

    # English filler phrases and the Japanese wording that replaces them.
    replacements = {
        "the context": "ドキュメント",
        "according to the document": "文書によりますと",
        "it is stated that": "記載されている内容では",
        "the answer is": "答えは",
        "based on the information": "提供された情報に基づきますと",
    }

    for eng, jp in replacements.items():
        # A callable replacement keeps the Japanese text literal (no
        # backslash/group processing); IGNORECASE catches capitalised
        # sentence openers that the lowercase keys would otherwise miss.
        answer = re.sub(
            re.escape(eng),
            lambda _match, jp=jp: jp,
            answer,
            flags=re.IGNORECASE,
        )

    # Normalise surrounding whitespace and capitalise the first character.
    answer = answer.strip()
    if answer:
        answer = answer[0].upper() + answer[1:]

    # Short-answer check. Japanese is written without spaces, so counting
    # whitespace-separated words would treat almost every Japanese answer
    # as a single "word". Count each kana/kanji character as one unit and
    # each whitespace-delimited run of other characters as one unit; for
    # purely space-delimited text this equals the plain word count.
    cjk = r"\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uff66-\uff9f"
    units = re.findall(rf"[{cjk}]|[^\s{cjk}]+", answer)
    if len(units) < 4:
        answer = "情報が不足しているようです。 " + answer

    return answer
|
| 79 |
|
| 80 |
+
# Japanese prompt template: instructs the model to answer the question in
# natural Japanese based on the supplied content. `{context}` and
# `{question}` are the two placeholders filled in when the prompt is rendered.
template = (
    "以下の内容に基づいて質問に自然な日本語で回答してください:\n"
    "{context}\n"
    "\n"
    "質問: {question}\n"
    "回答:"
)

# Prompt object handed to the QA chain; its variables match the
# placeholders used in `template` above.
QA_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)
|
| 91 |
|
| 92 |
def main():
    """Render the Streamlit page: upload a PDF, index it, and answer questions.

    Flow:
        1. Configure the page and show the file uploader.
        2. If nothing is uploaded yet, show a hint and stop.
        3. Save the upload to a unique temporary file, build the vector
           store from it with ``process_pdf``, then delete the file.
        4. Build a ``RetrievalQA`` chain over the vector store.
        5. When the user enters a question, run the chain, post-process
           the answer, and show it together with the source excerpts.

    Errors raised while answering a question are shown in the UI instead
    of propagating.
    """
    import contextlib
    import os
    import tempfile

    st.set_page_config(page_title="PDFアシスタント", page_icon="📘")
    st.title("PDFアシスタント 🤖")
    st.markdown("PDFファイルをアップロードして内容について質問してください")

    uploaded_file = st.file_uploader("PDFファイルを選択", type="pdf")

    if uploaded_file is None:
        st.info("PDFファイルをアップロードしてください")
        return

    # Use a unique temporary file instead of a fixed "temp.pdf" so that
    # concurrent sessions cannot overwrite each other's upload.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(uploaded_file.getbuffer())
        pdf_path = tmp.name

    try:
        with st.spinner("ドキュメントを分析中..."):
            vectorstore = process_pdf(pdf_path)
    finally:
        # process_pdf returns an already-built vector store, so the file is
        # presumably no longer needed; remove it on a best-effort basis.
        with contextlib.suppress(OSError):
            os.remove(pdf_path)

    llm = load_llm()

    # input_key="question" makes the chain accept {"question": ...} so the
    # input name matches the {question} placeholder of QA_PROMPT.
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": 4}),
        return_source_documents=True,
        input_key="question",
        chain_type_kwargs={
            "prompt": QA_PROMPT,
            "document_variable_name": "context",
        },
    )

    query = st.text_input("ドキュメントに関する質問を入力:")
    if not query:
        return

    with st.spinner("回答を生成中..."):
        try:
            result = qa_chain({"question": query})
            raw_answer = result["result"]
            answer = postprocess_answer(raw_answer)

            st.markdown("### 回答")
            st.success(answer)

            with st.expander("参考資料を表示"):
                for i, doc in enumerate(result["source_documents"]):
                    st.markdown(f"**引用 {i+1}:**")
                    excerpt = doc.page_content[:500]
                    # Only mark the excerpt as truncated when it really is.
                    if len(doc.page_content) > 500:
                        excerpt += "..."
                    st.info(excerpt)

        except Exception as e:
            # Top-level UI boundary: report the failure to the user.
            st.error(f"エラーが発生しました: {str(e)}")
|
| 141 |
|
| 142 |
# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    main()
|