uyen13 committed on
Commit
991d23a
·
verified ·
1 Parent(s): 3ca3d47

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -32
app.py CHANGED
@@ -7,12 +7,12 @@ from langchain.vectorstores import FAISS
7
  from langchain.chains import RetrievalQA
8
  from langchain.prompts import PromptTemplate
9
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
10
- import os
11
  import torch
12
 
 
13
  @st.cache_resource
14
  def load_llm():
15
- model_name = "google/flan-t5-xl"
16
 
17
  tokenizer = AutoTokenizer.from_pretrained(model_name)
18
  model = AutoModelForSeq2SeqLM.from_pretrained(
@@ -26,17 +26,19 @@ def load_llm():
26
  model=model,
27
  tokenizer=tokenizer,
28
  max_new_tokens=512,
29
- temperature=0.6,
 
30
  top_k=50,
31
- top_p=0.85,
32
  repetition_penalty=1.2,
33
- num_beams=3,
34
  early_stopping=True,
35
  do_sample=True
36
  )
37
-
38
  return HuggingFacePipeline(pipeline=pipe)
39
 
 
 
40
  def process_pdf(pdf_path):
41
  loader = PyPDFLoader(pdf_path)
42
  documents = loader.load()
@@ -48,37 +50,15 @@ def process_pdf(pdf_path):
48
  )
49
  texts = text_splitter.split_documents(documents)
50
 
51
- # Sử dụng model embedding đa ngôn ngữ
52
  embeddings = SentenceTransformerEmbeddings(model_name="paraphrase-multilingual-mpnet-base-v2")
53
  vectorstore = FAISS.from_documents(texts, embeddings)
54
  return vectorstore
55
 
56
def postprocess_answer(answer, *, min_words=4, min_chars=8):
    """Tidy a raw model answer before it is shown to the user.

    Steps, in order:
      1. Replace a fixed set of English boilerplate phrases with Japanese
         equivalents (matched case-insensitively, so sentence-initial
         "According to the document" is handled too).
      2. Strip surrounding whitespace.
      3. Upper-case the first character.
      4. Prepend a "not enough information" notice when the answer is
         very short.

    Args:
        answer: Raw answer text.
        min_words: Minimum number of whitespace-separated words for text
            without East Asian wide characters.
        min_chars: Minimum number of non-space characters for text that
            contains East Asian wide characters (e.g. Japanese).

    Returns:
        The cleaned answer string.
    """
    import re
    import unicodedata

    replacements = {
        "the context": "ドキュメント",
        "according to the document": "文書によりますと",
        "it is stated that": "記載されている内容では",
        "the answer is": "答えは",
        "based on the information": "提供された情報に基づきますと",
    }
    for eng, jp in replacements.items():
        # A callable replacement keeps the Japanese text literal (no
        # backslash/group expansion) while matching any capitalisation.
        answer = re.sub(
            re.escape(eng), lambda _m, jp=jp: jp, answer, flags=re.IGNORECASE
        )

    answer = answer.strip()
    if answer:
        answer = answer[0].upper() + answer[1:]

    # Japanese is written without spaces, so a whitespace word count is 1
    # for any sentence; judge such text by its character count instead.
    has_wide = any(
        unicodedata.east_asian_width(ch) in ("W", "F") for ch in answer
    )
    if has_wide:
        too_short = sum(not ch.isspace() for ch in answer) < min_chars
    else:
        too_short = len(answer.split()) < min_words

    if too_short:
        answer = "情報が不足しているようです。 " + answer

    return answer
79
 
80
- # Prompt template tiếng Nhật
81
- template = """以下の内容に基づいて質問に自然な日本語で回答してください:
 
 
82
  {context}
83
 
84
  質問: {question}
@@ -89,6 +69,30 @@ QA_PROMPT = PromptTemplate(
89
  input_variables=["context", "question"]
90
  )
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  def main():
93
  st.set_page_config(page_title="PDFアシスタント", page_icon="📘")
94
  st.title("PDFアシスタント 🤖")
@@ -139,5 +143,6 @@ def main():
139
  else:
140
  st.info("PDFファイルをアップロードしてください")
141
 
 
142
  if __name__ == "__main__":
143
  main()
 
7
  from langchain.chains import RetrievalQA
8
  from langchain.prompts import PromptTemplate
9
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 
10
  import torch
11
 
12
+ # --- Load the language model ---
13
  @st.cache_resource
14
  def load_llm():
15
+ model_name = "google/flan-t5-xl" # Có thể thay bằng google/flan-ul2 hoặc mistralai/Mistral-7B-Instruct-v0.2 nếu có GPU
16
 
17
  tokenizer = AutoTokenizer.from_pretrained(model_name)
18
  model = AutoModelForSeq2SeqLM.from_pretrained(
 
26
  model=model,
27
  tokenizer=tokenizer,
28
  max_new_tokens=512,
29
+ temperature=0.7,
30
+ top_p=0.9,
31
  top_k=50,
 
32
  repetition_penalty=1.2,
33
+ num_beams=4,
34
  early_stopping=True,
35
  do_sample=True
36
  )
37
+
38
  return HuggingFacePipeline(pipeline=pipe)
39
 
40
+
41
+ # --- Process the PDF file ---
42
  def process_pdf(pdf_path):
43
  loader = PyPDFLoader(pdf_path)
44
  documents = loader.load()
 
50
  )
51
  texts = text_splitter.split_documents(documents)
52
 
 
53
  embeddings = SentenceTransformerEmbeddings(model_name="paraphrase-multilingual-mpnet-base-v2")
54
  vectorstore = FAISS.from_documents(texts, embeddings)
55
  return vectorstore
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
+ # --- Prompt pre-processing and answer post-processing ---
59
+ template = """以下の文書情報をもとに、質問に自然で丁寧な日本語で回答してください。できるだけ具体的に、例を挙げて分かりやすく説明してください。
60
+
61
+ 文書情報:
62
  {context}
63
 
64
  質問: {question}
 
69
  input_variables=["context", "question"]
70
  )
71
 
72
+
73
def postprocess_answer(answer, *, min_words=3, min_chars=6):
    """Normalise a raw model answer before it is displayed.

    Steps, in order:
      1. Strip surrounding whitespace.
      2. If the text contains "Answer:", keep only what follows the last
         occurrence.
      3. Append "。" when the answer does not already end in sentence
         punctuation (ASCII or full-width).
      4. Upper-case the first character.
      5. Prepend a polite "the material does not cover this in detail"
         notice when the answer is very short.

    Args:
        answer: Raw answer text.
        min_words: Minimum number of whitespace-separated words for text
            without East Asian wide characters.
        min_chars: Minimum number of non-space characters (ignoring
            trailing punctuation) for text that contains East Asian wide
            characters such as Japanese.

    Returns:
        The cleaned answer string.
    """
    import unicodedata

    # Full-width variants are included so "…ですか？" is not turned into
    # "…ですか？。".
    terminators = "。.?!？！．"
    short_notice = (
        "ご参考までに、提供された資料にはその点についての詳細な記載が見受けられませんが、"
    )

    answer = answer.strip()

    # Drop any leading prompt echo such as "... Answer: <text>".
    if "Answer:" in answer:
        answer = answer.split("Answer:")[-1].strip()

    if answer and answer[-1] not in terminators:
        answer += "。"

    if answer:
        answer = answer[0].upper() + answer[1:]

    # Japanese has no spaces between words, so a whitespace word count is 1
    # for any sentence. Judge such text by character count instead; trailing
    # punctuation is ignored so the "。" added above does not affect this.
    core = answer.rstrip(terminators)
    has_wide = any(
        unicodedata.east_asian_width(ch) in ("W", "F") for ch in core
    )
    if has_wide:
        too_short = sum(not ch.isspace() for ch in core) < min_chars
    else:
        too_short = len(answer.split()) < min_words

    if too_short:
        answer = short_notice + answer

    return answer
93
+
94
+
95
+ # --- Main application interface ---
96
  def main():
97
  st.set_page_config(page_title="PDFアシスタント", page_icon="📘")
98
  st.title("PDFアシスタント 🤖")
 
143
  else:
144
  st.info("PDFファイルをアップロードしてください")
145
 
146
+
147
  if __name__ == "__main__":
148
  main()