Spaces:

uyen13
/

chatbot

Sleeping

App Files Files Community

uyen13 committed on May 13, 2025

Commit

694a2d1

verified ·

1 Parent(s): 75d3fac

Update app.py

Browse files

Files changed (1) hide show

app.py +82 -34

app.py CHANGED Viewed

@@ -5,19 +5,20 @@ from langchain.text_splitter import CharacterTextSplitter
 from langchain.embeddings import SentenceTransformerEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.chains import RetrievalQA
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 import os
 import torch
-# Load FLAN-T5 model
 @st.cache_resource
 def load_llm():
     model_name = "google/flan-t5-xl"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForSeq2SeqLM.from_pretrained(
         model_name,
-        torch_dtype=torch.float32,  # T5 thường dùng float32 hoặc bfloat16 nếu có GPU hỗ trợ
         device_map="auto"
     )
@@ -25,72 +26,119 @@ def load_llm():
         "text2text-generation",
         model=model,
         tokenizer=tokenizer,
-        max_new_tokens=256,
-        temperature=0.7,
-        top_p=0.95,
-        repetition_penalty=1.15,
         do_sample=True
     )
     return HuggingFacePipeline(pipeline=pipe)
-# Process PDF and create vectorstore
 def process_pdf(pdf_path):
     loader = PyPDFLoader(pdf_path)
     documents = loader.load()
-    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
     texts = text_splitter.split_documents(documents)
     embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
     vectorstore = FAISS.from_documents(texts, embeddings)
     return vectorstore
 def main():
-    st.set_page_config(page_title="PDF Chatbot", page_icon="📄")
-    st.title("PDF Chatbot 📄")
-    st.markdown("Upload a PDF and ask questions about its content using FLAN-T5!")
-    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
     if uploaded_file is not None:
-        # Save uploaded file temporarily
         with open("temp.pdf", "wb") as f:
             f.write(uploaded_file.getbuffer())
-        # Process PDF
-        with st.spinner("Processing PDF..."):
             vectorstore = process_pdf("temp.pdf")
-        # Load LLM
         llm = load_llm()
-        # Create QA chain
         qa_chain = RetrievalQA.from_chain_type(
             llm=llm,
             chain_type="stuff",
             retriever=vectorstore.as_retriever(search_kwargs={"k": 4}),
-            return_source_documents=True
         )
-        # Query input
-        query = st.text_input("Ask a question about the PDF:")
         if query:
-            with st.spinner("Generating answer..."):
-                result = qa_chain({"query": query})
-                answer = result["result"]
-                source_docs = result["source_documents"]
-                st.markdown("### Answer")
-                st.write(answer)
-                with st.expander("Show Source Documents"):
-                    for i, doc in enumerate(source_docs):
-                        st.markdown(f"**Source {i+1}:**")
-                        st.write(doc.page_content)
     else:
-        st.info("Please upload a PDF file to get started.")
 if __name__ == "__main__":
     main()

 from langchain.embeddings import SentenceTransformerEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.chains import RetrievalQA
+from langchain.prompts import PromptTemplate
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 import os
 import torch
+# Load FLAN-T5 model với các tham số tối ưu
 @st.cache_resource
 def load_llm():
     model_name = "google/flan-t5-xl"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForSeq2SeqLM.from_pretrained(
         model_name,
+        torch_dtype=torch.float32,
         device_map="auto"
     )
         "text2text-generation",
         model=model,
         tokenizer=tokenizer,
+        max_new_tokens=512,
+        temperature=0.6,
+        top_k=50,
+        top_p=0.85,
+        repetition_penalty=1.2,
+        num_beams=3,
+        early_stopping=True,
         do_sample=True
     )
     return HuggingFacePipeline(pipeline=pipe)
# Process a PDF and build the vector store used for retrieval
def process_pdf(pdf_path):
    """Index the contents of a PDF file in a FAISS vector store.

    The file is loaded with ``PyPDFLoader``, the loaded documents are split
    with ``CharacterTextSplitter`` (newline separator, ``chunk_size=1000``,
    ``chunk_overlap=200``), and the resulting pieces are embedded with the
    ``all-MiniLM-L6-v2`` sentence-transformer model.

    Args:
        pdf_path: Path of the PDF file to index.

    Returns:
        The FAISS vector store built from the split documents.
    """
    pages = PyPDFLoader(pdf_path).load()
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
    )
    chunks = splitter.split_documents(pages)
    embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    return FAISS.from_documents(chunks, embedder)
# Post-process the model's raw answer before it is shown to the user
def postprocess_answer(answer):
    """Polish a raw model answer for display in the Vietnamese UI.

    The function:

    1. replaces a fixed set of stock English phrases with Vietnamese
       equivalents, regardless of letter case (so a sentence-initial
       "The answer is" is handled as well as "the answer is");
    2. strips surrounding whitespace and upper-cases the first character;
    3. prefixes answers shorter than four words with a notice that the
       information is unclear.

    Args:
        answer: Raw answer text. ``None`` is treated as an empty string and
            any other non-string value is converted with ``str()``.

    Returns:
        The cleaned-up answer. An empty or whitespace-only input yields just
        the "unclear" notice, without a dangling trailing space.
    """
    import re  # local import so the block does not depend on file-level imports

    unclear_notice = "Thông tin này hiện chưa rõ ràng."

    # Stock English phrases the model tends to emit, mapped to Vietnamese.
    replacements = {
        "the context": "tài liệu",
        "according to the document": "theo nội dung tài liệu",
        "it is stated that": "trong tài liệu có đề cập rằng",
        "the answer is": "câu trả lời là",
        "based on the information": "dựa trên thông tin được cung cấp",
    }

    text = "" if answer is None else str(answer)

    # One case-insensitive pass over all phrases. re.ASCII restricts case
    # folding to ASCII so that lowering the matched text always gives back
    # one of the dictionary keys.
    phrase_pattern = re.compile(
        "|".join(re.escape(phrase) for phrase in replacements),
        re.IGNORECASE | re.ASCII,
    )
    text = phrase_pattern.sub(
        lambda match: replacements[match.group(0).lower()], text
    )

    # Normalise the formatting.
    text = text.strip()
    if not text:
        return unclear_notice
    text = text[0].upper() + text[1:]

    # Flag very short answers as possibly unreliable.
    if len(text.split()) < 4:
        text = unclear_notice + " " + text
    return text
# Vietnamese prompt template for the "stuff" question-answering chain.
# The chain fills {context} with the retrieved passages and {question} with
# the user's query, so the template must declare exactly these two
# placeholders; a template without {context} is rejected when the chain is
# built.
template = """Hãy trả lời câu hỏi một cách tự nhiên và mạch lạc như con người.
Sử dụng ngôn từ dễ hiểu, tránh các thuật ngữ kỹ thuật phức tạp.
Nếu không có thông tin trong tài liệu, hãy trả lời 'Tôi không tìm thấy thông tin liên quan trong tài liệu'.
Tài liệu:
{context}
Câu hỏi: {question}
Trả lời:"""
QA_PROMPT = PromptTemplate.from_template(template)
 def main():
+    st.set_page_config(page_title="Trợ lý PDF thông minh", page_icon="📘")
+    st.title("Trợ lý PDF thông minh 🤖")
+    st.markdown("Tải lên file PDF và đặt câu hỏi về nội dung tài liệu!")
+    uploaded_file = st.file_uploader("Chọn file PDF", type="pdf")
     if uploaded_file is not None:
+        # Lưu file tạm
         with open("temp.pdf", "wb") as f:
             f.write(uploaded_file.getbuffer())
+        # Xử lý PDF
+        with st.spinner("Đang phân tích tài liệu..."):
             vectorstore = process_pdf("temp.pdf")
+        # Khởi tạo model
         llm = load_llm()
+        # Tạo QA chain với prompt template
         qa_chain = RetrievalQA.from_chain_type(
             llm=llm,
             chain_type="stuff",
             retriever=vectorstore.as_retriever(search_kwargs={"k": 4}),
+            return_source_documents=True,
+            chain_type_kwargs={"prompt": QA_PROMPT}
         )
+        # Giao diện hỏi đáp
+        query = st.text_input("Nhập câu hỏi của bạn về tài liệu:")
         if query:
+            with st.spinner("Đang tổng hợp câu trả lời..."):
+                try:
+                    result = qa_chain({"query": query})
+                    raw_answer = result["result"]
+                    answer = postprocess_answer(raw_answer)
+                    st.markdown("### Câu trả lời")
+                    st.success(answer)
+                    with st.expander("Xem chi tiết nguồn tham khảo"):
+                        for i, doc in enumerate(result["source_documents"]):
+                            st.markdown(f"**Trích dẫn {i+1}:**")
+                            st.info(doc.page_content[:500] + "...")
+                except Exception as e:
+                    st.error("Có lỗi xảy ra khi xử lý yêu cầu. Vui lòng thử lại với câu hỏi khác.")
     else:
+        st.info("Vui lòng tải lên file PDF để bắt đầu.")
 if __name__ == "__main__":
     main()