uyen13 committed on
Commit
dddbc2e
·
verified ·
1 Parent(s): f3d30d1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -145
app.py CHANGED
@@ -1,151 +1,96 @@
1
  import streamlit as st
2
- from langchain.llms import HuggingFacePipeline
 
3
  from langchain.document_loaders import PyPDFLoader
4
- from langchain.text_splitter import CharacterTextSplitter
5
- from langchain.embeddings import SentenceTransformerEmbeddings
6
  from langchain.vectorstores import FAISS
7
- from langchain.chains import RetrievalQA
8
- from langchain.prompts import PromptTemplate
9
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
10
- import torch
11
 
12
# --- 1. Load the TinyLlama (or Mistral) model ---
@st.cache_resource
def load_llm():
    """Build the text-generation LLM used by the QA chain.

    The result is cached with ``st.cache_resource`` so the model weights are
    loaded once per process instead of on every Streamlit rerun.

    Returns:
        A ``HuggingFacePipeline`` wrapping a sampling text-generation
        pipeline (at most 512 new tokens, prompt not echoed back).
    """
    # Swap in "mistralai/Mistral-7B-Instruct-v0.2" when a GPU is available.
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        # Reuse EOS as the padding token. Registering a brand-new "[PAD]"
        # token would grow the vocabulary beyond the model's embedding matrix
        # (which is never resized here), so padding could index out of range.
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,  # float32 is the recommended dtype on CPU
        device_map="auto",
    )

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        repetition_penalty=1.2,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
        truncation=True,
        return_full_text=False,  # return only the completion, not the prompt
    )

    return HuggingFacePipeline(pipeline=pipe)
44
-
45
-
46
# --- 2. PDF processing ---
def process_pdf(pdf_path):
    """Index the PDF at ``pdf_path`` and return the resulting FAISS store.

    The document is loaded with ``PyPDFLoader``, split on newlines into
    1000-character chunks with a 200-character overlap, embedded with the
    multilingual MPNet sentence-transformer, and stored in a FAISS index.
    """
    pages = PyPDFLoader(pdf_path).load()

    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
    )
    chunks = splitter.split_documents(pages)

    embedder = SentenceTransformerEmbeddings(
        model_name="paraphrase-multilingual-mpnet-base-v2"
    )
    return FAISS.from_documents(chunks, embedder)
61
-
62
-
63
# --- 3. Japanese prompt template (natural, conversational tone) ---
# The text is wrapped in [INST] ... [/INST] instruction markers and exposes
# two placeholders: {context} (retrieved document text) and {question}.
template = """<s>[INST]あなたは親しみやすく丁寧なアシスタントです。以下の文書情報をもとに、質問に自然で分かりやすい日本語で回答してください。

- 回答はできるだけ口語的で柔らかい表現を使ってください。
- 理由や例を交えて説明すると良いでしょう。
- 分からない場合は正直に「その点については詳しく記載されていません」と答えてください。

文書情報:
{context}

質問: {question}
回答: [/INST]"""

# Prompt object handed to the retrieval chain.
QA_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)
77
-
78
-
79
# --- 4. Answer post-processing ---
# Lead-in phrases the model tends to emit; they are removed from the answer.
_BOILERPLATE_PHRASES = (
    "Answer:",
    "答え:",
    "回答:",
    "The answer is",
    "Based on the context",
)
# Characters that already terminate a sentence (half- and full-width forms).
_SENTENCE_ENDINGS = ("。", ".", "?", "!", "?", "!", ".")
# An answer below these sizes is considered too short to stand on its own.
_MIN_WORDS = 4
_MIN_JAPANESE_CHARS = 10
_SHORT_ANSWER_PREFIX = (
    "資料にはその件についての詳細な記載が見受けられませんが、以下のように推測されます:"
)


def _contains_japanese(text):
    """Return True if ``text`` has any kana or CJK ideograph character."""
    return any(
        "\u3040" <= ch <= "\u30ff" or "\u4e00" <= ch <= "\u9fff"
        for ch in text
    )


def _is_too_short(text):
    """Return True if ``text`` is too brief to present without a disclaimer."""
    if _contains_japanese(text):
        # Japanese is written without spaces, so a whitespace word count is
        # always ~1; measure the non-whitespace character count instead.
        return len("".join(text.split())) < _MIN_JAPANESE_CHARS
    return len(text.split()) < _MIN_WORDS


def postprocess_answer(answer):
    """Clean up a raw model answer for display.

    Steps: trim whitespace, drop boilerplate lead-in phrases, upper-case the
    first character, make sure the text ends with sentence punctuation
    (appending "。" if not), and prepend a disclaimer when the answer is very
    short.

    Args:
        answer: Raw answer string produced by the model.

    Returns:
        The cleaned answer string.
    """
    answer = answer.strip()
    for phrase in _BOILERPLATE_PHRASES:
        answer = answer.replace(phrase, "").strip()

    if answer:
        answer = answer[0].upper() + answer[1:]
        # Full-width "?"/"!" count as endings too, avoiding output like "…か?。".
        if not answer.endswith(_SENTENCE_ENDINGS):
            answer += "。"

    if _is_too_short(answer):
        answer = _SHORT_ANSWER_PREFIX + answer

    return answer
96
-
97
-
98
# --- 5. Main application UI ---
def main():
    """Render the PDF question-answering page.

    Lets the user upload a PDF, indexes it with ``process_pdf``, builds a
    "stuff" RetrievalQA chain over the top-4 retrieved chunks using
    ``QA_PROMPT``, and answers the question typed by the user, showing the
    post-processed answer together with the source excerpts.
    """
    # Local imports: only this function needs them.
    import os
    import tempfile

    st.set_page_config(page_title="PDFアシスタント", page_icon="📘")
    st.title("PDFアシスタント 🤖")
    st.markdown("PDFファイルをアップロードして内容について質問してください")

    uploaded_file = st.file_uploader("PDFファイルを選択", type="pdf")
    if uploaded_file is None:
        st.info("PDFファイルをアップロードしてください")
        return

    # Write the upload to a unique temporary file. A fixed "temp.pdf" in the
    # working directory would be shared by all concurrent sessions, so one
    # user's upload could overwrite another's while it is being indexed.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(uploaded_file.getbuffer())
        pdf_path = tmp.name
    try:
        with st.spinner("ドキュメントを分析中..."):
            vectorstore = process_pdf(pdf_path)
    finally:
        # The file is only needed while the index is being built.
        os.remove(pdf_path)

    llm = load_llm()

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": 4}),
        return_source_documents=True,
        input_key="question",
        chain_type_kwargs={
            "prompt": QA_PROMPT,
            "document_variable_name": "context",
        },
    )

    query = st.text_input("ドキュメントに関する質問を入力:")
    if not query:
        return

    with st.spinner("回答を生成中..."):
        # UI boundary: report any failure to the user instead of crashing.
        try:
            result = qa_chain({"question": query})
            answer = postprocess_answer(result["result"])

            st.markdown("### 回答")
            st.success(answer)

            with st.expander("参考資料を表示"):
                for number, doc in enumerate(result["source_documents"], start=1):
                    st.markdown(f"**引用 {number}:**")
                    st.info(doc.page_content[:500] + "...")
        except Exception as e:
            st.error(f"エラーが発生しました: {str(e)}")


if __name__ == "__main__":
    main()
 
1
  import streamlit as st
2
+ from streamlit_chat import message
3
+ import tempfile
4
  from langchain.document_loaders import PyPDFLoader
5
+ from langchain.embeddings import HuggingFaceEmbeddings
 
6
  from langchain.vectorstores import FAISS
7
+ from langchain.chains import ConversationalRetrievalChain
8
+ from langchain_g4f import G4FLLM
9
+ from g4f import Provider, models
 
10
 
11
+ # Define the path for generated embeddings
12
+ DB_FAISS_PATH = 'vectorstore/db_faiss'
 
 
 
 
 
 
 
13
 
14
# Load the LLM through G4F (supports GPT-3.5, etc.)
def load_llm():
    """Return a G4F-backed LLM using ``gpt_35_turbo`` via the FreeGpt provider."""
    return G4FLLM(
        model=models.gpt_35_turbo,
        provider=Provider.FreeGpt,
    )
21
+
22
# Hide Streamlit's default main menu and footer.
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

# Page title.
st.title("📄 PDF Chatbot - Zendo美女チャットボックス")

# PDF upload widget.
uploaded_file = st.file_uploader("Tải lên tệp PDF của bạn", type="pdf")


def _build_chain(pdf_bytes):
    """Index the given PDF bytes and return a conversational retrieval chain.

    The bytes are written to a temporary ``.pdf`` file (PyPDFLoader needs a
    path), which is deleted as soon as the pages are loaded. The pages are
    embedded on CPU with all-MiniLM-L6-v2, stored in a FAISS index that is
    also saved to ``DB_FAISS_PATH``, and wrapped in a
    ``ConversationalRetrievalChain`` using the LLM from ``load_llm``.
    """
    import os  # local import: only needed for temp-file cleanup

    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmpfile:
        tmpfile.write(pdf_bytes)
        tmpfile_path = tmpfile.name
    try:
        pdf_data = PyPDFLoader(tmpfile_path).load()
    finally:
        # Without this, every rerun would leave another PDF copy on disk.
        os.remove(tmpfile_path)

    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cpu'},
    )
    db = FAISS.from_documents(pdf_data, embeddings)
    db.save_local(DB_FAISS_PATH)

    return ConversationalRetrievalChain.from_llm(llm=load_llm(), retriever=db.as_retriever())


if uploaded_file is not None:
    # Streamlit re-executes this script on every interaction, so keep the
    # chain in session state and rebuild it only when a different file is
    # uploaded, instead of re-embedding the PDF for each chat message.
    source_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get('chain_source') != source_key:
        st.session_state['chain'] = _build_chain(uploaded_file.getvalue())
        st.session_state['chain_source'] = source_key
    chain = st.session_state['chain']

    st.success("PDF đã được tải lên và xử lý thành công!")

    def conversational_chat(query):
        """Ask the chain ``query`` with the session's history; return the answer.

        The (query, answer) pair is appended to ``st.session_state['history']``.
        """
        result = chain({"question": query, "chat_history": st.session_state['history']})
        st.session_state['history'].append((query, result["answer"]))
        return result["answer"]

    # Initialize session state for chat history and displayed messages.
    if 'history' not in st.session_state:
        st.session_state['history'] = []

    if 'generated' not in st.session_state:
        st.session_state['generated'] = ["こんにちは!zendo美女です。PDFの内容についてご質問ください... 🤗"]

    if 'past' not in st.session_state:
        st.session_state['past'] = ["チャットはここから"]

    # The transcript container is created first so it renders above the form.
    response_container = st.container()
    container = st.container()

    with container:
        with st.form(key='my_form', clear_on_submit=True):
            user_input = st.text_input("ChatBox", placeholder="質問をご記入ください...", key='input')
            submit_button = st.form_submit_button(label='Send')

        if submit_button and user_input:
            output = conversational_chat(user_input)
            st.session_state['past'].append(user_input)
            st.session_state['generated'].append(output)

    # Display the chat transcript: each user message followed by its reply.
    if st.session_state['generated']:
        with response_container:
            exchanges = zip(st.session_state["past"], st.session_state["generated"])
            for i, (user_msg, bot_msg) in enumerate(exchanges):
                message(user_msg, is_user=True, key=str(i) + '_user', avatar_style="big-smile")
                message(bot_msg, key=str(i), avatar_style="thumbs")

else:
    st.info("Vui lòng tải lên một tệp PDF để bắt đầu trò chuyện.")