Mr-Hsu committed on
Commit
0a7840a
·
verified ·
1 Parent(s): b81cb86

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -281
app.py DELETED
@@ -1,281 +0,0 @@
1
- import os
2
- import uuid
3
- import fitz # pymupdf
4
- import streamlit as st
5
- from typing import List, Tuple
6
- import pdfkit
7
- import json
8
- from fpdf import FPDF
9
- from langchain_community.document_loaders import PyPDFLoader
10
- from langchain_text_splitters import RecursiveCharacterTextSplitter
11
- from langchain_community.vectorstores import Chroma
12
- from langchain_ollama import OllamaEmbeddings
13
- from langchain_community.embeddings import HuggingFaceEmbeddings
14
- from langchain_core.prompts import ChatPromptTemplate
15
- from langchain_ollama.llms import OllamaLLM
16
- from openai import OpenAI
17
- #from openai import OpenAI
18
- #client = OpenAI(api_key="<DeepSeek API Key>", base_url="https://api.deepseek.com")
19
- #response = client.chat.completions.create(
20
- # model="deepseek-chat",
21
- # messages=[
22
- # {"role": "system", "content": "You are a helpful assistant"},
23
- # {"role": "user", "content": "Hello"},
24
- # ],
25
- # stream=False
26
- #)
27
- #print(response.choices[0].message.content)
28
-
29
# ========== Configuration ==========
class Config:
    """Central application settings: storage paths, prompt templates,
    and text-splitter parameters."""

    # Temporary landing directory for uploaded PDFs.
    UPLOAD_DIR = os.path.join("data", "uploads")
    # Root folder under which the Chroma databases are persisted.
    CHROMA_BASE = "chroma_db"

    # Prompt templates per interaction mode ([INST] tags follow the
    # Llama-style instruction format expected by the models used here).
    TEMPLATES = {
        "chat": """[INST] You're a concise AI assistant. Keep answer in detail, clear and concise. The answer contains equations, mathematical derivation process, necessary references and monographs.:
Question: {question}
Context: {context}
Answer: [/INST]""",
        "summary": """[INST] Summarize key points including equations, mathematical derivation process, necessary references and monographs from:
Context: {context}
Summary: [/INST]""",
    }

    # Keyword arguments forwarded verbatim to RecursiveCharacterTextSplitter.
    SPLITTER_CONFIG = {
        "chunk_size": 1024,
        "chunk_overlap": 256,
        "separators": ["\n\n", "\n", r"(?<=[.!?])\s+"],
    }


# Module-level singleton; the upload directory is created eagerly at import.
config = Config()
os.makedirs(config.UPLOAD_DIR, exist_ok=True)
51
-
52
-
53
-
54
# ========== Core Services ==========
class DocumentProcessor:
    """Handles PDF processing and vector store operations"""

    def __init__(self, embeddings, model_name: str):
        # Embedding backend (HuggingFace or Ollama) used by the vector store.
        self.embeddings = embeddings
        # Selected model name; also determines the on-disk Chroma directory
        # (see _chroma_path).
        self.model_name = model_name
        # Splitter configured once from the module-level Config.
        self.text_splitter = RecursiveCharacterTextSplitter(**config.SPLITTER_CONFIG)

    @st.cache_resource(show_spinner=False)
    def _process_pdfs(_self, files: List) -> Tuple[Chroma, List]:
        """Process PDFs into vector store with cache invalidation.

        The `_self` underscore prefix tells Streamlit's cache not to hash
        the instance when computing the cache key; the `files` argument is
        the cache key instead.
        """
        docs = []
        for file in files:
            # Each upload is written to disk because PyPDFLoader needs a path.
            file_path = _self._save_temp_file(file)
            docs.extend(PyPDFLoader(file_path).load_and_split(_self.text_splitter))
            os.remove(file_path)  # temp copy no longer needed once loaded

        # Builds a fresh store under "<chroma_path>/_temp"; the persistent
        # database is only modified elsewhere via _load_database + add_documents.
        vector_store = Chroma.from_documents(collection_name="pdf_docs", documents=docs, embedding=_self.embeddings, persist_directory=os.path.join(_self._chroma_path,"_temp"))
        #if os.path.exists(_self._chroma_path):
        # Update database
        #    vector_store = Chroma(collection_name="pdf_docs", embedding_function=_self.embeddings, persist_directory=_self._chroma_path)
        #    vector_store.add_documents(docs)
        #else:
        #    vector_store = Chroma.from_documents(collection_name="pdf_docs", documents=docs, embedding=_self.embeddings, persist_directory=_self._chroma_path)

        return vector_store, docs

    @st.cache_resource(show_spinner=False)
    def _load_database(_self) -> Chroma:
        """Load Database and return vector store.

        Opens (or lazily creates) the persistent Chroma collection for the
        current model; raises if the backing directory is unusable.
        """
        vector_store = Chroma(collection_name="pdf_docs", embedding_function=_self.embeddings, persist_directory=_self._chroma_path)
        return vector_store

    @property
    def _chroma_path(self) -> str:
        """Per-model persistence directory; ':' is not filesystem-safe, so it
        is replaced with '_' (e.g. "deepseek-r1:7b" -> "deepseek-r1_7b")."""
        return os.path.join(config.CHROMA_BASE, self.model_name.replace(":", "_"))

    def _save_temp_file(self, file) -> str:
        """Save uploaded file with UUID and return path.

        The UUID avoids collisions when several uploads share a filename.
        """
        file_path = os.path.join(config.UPLOAD_DIR, f"temp_{uuid.uuid4()}.pdf")
        with open(file_path, "wb") as f:
            f.write(file.getbuffer())
        return file_path
98
-
99
# Function to generate a PDF with summary and topic
def create_pdf(output_txt, original_file_name) -> str:
    """Render *output_txt* as a one-page "<stem> summary.pdf" and return its path.

    Args:
        output_txt: Text body written under the "Summary" heading.
        original_file_name: Name of the source PDF; its stem names the output.

    Returns:
        Path of the generated PDF inside the local ``tmp/`` directory.
    """
    base_name = os.path.splitext(original_file_name)[0]  # Remove the .pdf extension
    pdf_file_name = f"{base_name} summary.pdf"  # Create the new filename

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    pdf.cell(200, 10, txt="Summary", ln=True, align='C')
    # BUG FIX: the original referenced an undefined name `summary` here
    # (NameError at runtime); the text to render is the `output_txt` parameter.
    pdf.multi_cell(0, 10, txt=output_txt)

    # BUG FIX: FPDF.output does not create missing directories; ensure tmp/
    # exists before writing.
    os.makedirs("tmp", exist_ok=True)
    pdf_file_path = f"tmp/{pdf_file_name}"
    pdf.output(pdf_file_path)

    return pdf_file_path
116
-
117
class ChatService:
    """Handles chat interactions and responses.

    Tries the DeepSeek online API first; on any failure it falls back to a
    local Ollama model.
    """

    # Local Ollama model used when the DeepSeek API is unreachable.
    FALLBACK_MODEL = "deepseek-r1:7b"

    def __init__(self, api_key, deepseek_reasoner=False, max_tokens=2048, temperature=1.0, frequency_penalty=0.0, presence_penalty=0.0, logprobs=False, top_logprobs=None, vector_store=None):
        self.api_key = api_key
        # When True use "deepseek-reasoner", otherwise "deepseek-chat".
        self.deepseek_reasoner = deepseek_reasoner
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.frequency_penalty = frequency_penalty
        self.presence_penalty = presence_penalty
        self.logprobs = logprobs
        self.top_logprobs = top_logprobs
        # Optional Chroma store; when set, queries are augmented with context.
        self.vector_store = vector_store

    def generate_response(self, query: str, template_key: str) -> str:
        """Generic response generator for different templates ("chat"/"summary").

        Returns the model's answer text; on API failure, runs the prompt
        through a local Ollama model instead.
        """
        context = self._get_context(query) if self.vector_store else query
        prompt = ChatPromptTemplate.from_template(config.TEMPLATES[template_key])
        try:
            client = OpenAI(api_key=self.api_key, base_url="https://api.deepseek.com")
            prompt_online = [{"role": "system", "content": "You're a concise AI assistant. Keep answer in detail, clear and concise. The answer contains equations, mathematical derivation process, necessary references and monographs."},{"role": "user", "content": context}]
            if not self.deepseek_reasoner:  # deepseek-chat
                response = client.chat.completions.create(
                    messages=prompt_online,
                    model="deepseek-chat",
                    frequency_penalty=self.frequency_penalty,
                    max_tokens=self.max_tokens,
                    presence_penalty=self.presence_penalty,
                    response_format={'type': 'text'},  # alternative: json_object
                    stop=None,
                    stream=False,
                    stream_options=None,
                    temperature=self.temperature,
                    top_p=1,
                    tools=None,
                    tool_choice=None,
                    logprobs=self.logprobs,
                    top_logprobs=self.top_logprobs,
                )
                return response.choices[0].message.content
            # deepseek-reasoner
            response = client.chat.completions.create(
                messages=prompt_online,
                model="deepseek-reasoner",
                max_tokens=self.max_tokens,
                response_format={'type': 'text'},  # alternative: json_object
                stop=None,
                stream=False,
                stream_options=None,
                tools=None,
                tool_choice=None,
            )
            # NOTE(review): this returns the model's chain-of-thought
            # (`reasoning_content`); the final answer is in `message.content`
            # — confirm which is intended.
            return response.choices[0].message.reasoning_content
        except Exception as e:
            st.error(f"DeepSeek Online is not available now, Processing locally: {str(e)}")
            # BUG FIX: the original fallback referenced names that do not exist
            # in this scope (`model_name`, `temperature`, `frequency_penalty`,
            # `max_tokens`, `presence_penalty`, `top_logprobs_logical`,
            # `top_logprobs`), raising NameError whenever the API call failed.
            # Use the instance configuration and a class-level fallback model.
            llm = OllamaLLM(
                model=self.FALLBACK_MODEL,
                temperature=self.temperature,
                frequency_penalty=self.frequency_penalty,
                max_tokens=self.max_tokens,
                presence_penalty=self.presence_penalty,
                response_format={'type': 'text'},
                stop=None,
                stream=False,
                stream_options=None,
                top_p=1,
                tools=None,
                tool_choice=None,
                logprobs=self.logprobs,
                top_logprobs=self.top_logprobs,
            )
            return (prompt | llm).invoke({"question": query, "context": context})

    def _get_context(self, query: str) -> str:
        """Retrieve relevant context from vector store via MMR search.

        fetch_k candidates are re-ranked for diversity (lambda_mult: 1 =
        minimum diversity, 0 = maximum; default 0.5); only the top 5 chunks
        are joined into the context string.
        """
        docs = self.vector_store.max_marginal_relevance_search(
            query, k=15, fetch_k=30, lambda_mult=0.6
        )
        return "\n\n".join(d.page_content for d in docs[:5])  # first 5 chunks
166
-
167
# ========== UI Components ==========
def setup_sidebar() -> Tuple[bool, str, int, object, bool, float, float, float, List]:
    """Configure and return sidebar components.

    Returns a 9-tuple: (deepseek_reasoner, model_name, max_tokens,
    top_logprobs, top_logprobs_logical, temperature, frequency_penalty,
    presence_penalty, files).  `top_logprobs` is an int or None.
    """
    with st.sidebar:
        st.subheader("⚙️ Settings")
        # Toggle between DeepSeek's chat and reasoner online models.
        deepseek_reasoner = st.toggle("DeepSeek Reasoner")
        model_name = st.radio(
            "Model Selection:",
            ["deepseek-r1:1.5b", "deepseek-r1:7b", "deepseek-r1:8b", "deepseek-r1:14b", "deepseek-r1:32b"],
            horizontal=True
        )
        st.divider()
        #max_tokens = st.slider("Max Tokens:", 1, 8192, 2048)
        max_tokens = st.select_slider("Max Tokens:", options=[128, 256, 512, 1024, 2048, 4096, 8192], value=(2048))
        top_logprobs = st.slider("Log Probabilities of Each Output Token:", 0, 20, 1)
        # 0 disables logprobs: the API expects None rather than 0, and the
        # boolean flag records whether logprobs are requested at all.
        if top_logprobs == 0:
            top_logprobs_logical = False
            top_logprobs = None
        else:
            top_logprobs_logical = True

        temperature = st.slider("Creativity Level:", 0.0, 2.0, 0.8)
        frequency_penalty = st.slider("Decreasing Repeated Topics:", -2.0, 2.0, 0.0)
        presence_penalty = st.slider("Increasing New Topics:", -2.0, 2.0, 0.0)

        # `files` is a list of uploaded PDFs (accept_multiple_files=True);
        # show_pdf_preview renders the first page of the first file.
        files = st.file_uploader("Upload PDFs", type="pdf", accept_multiple_files=True)
        if files:
            show_pdf_preview(files)
        return deepseek_reasoner, model_name, max_tokens, top_logprobs, top_logprobs_logical, temperature, frequency_penalty, presence_penalty, files
196
-
197
def show_pdf_preview(file) -> None:
    """Render the first page of the first uploaded PDF as a preview image.

    `file` is the list returned by st.file_uploader; only file[0] is shown.
    Any failure is reported via st.error instead of propagating.
    """
    try:
        document = fitz.open(stream=file[0].getvalue())
        try:
            pixmap = document[0].get_pixmap()
            st.image(pixmap.tobytes(), caption="First Page Preview", use_container_width=True)
        finally:
            document.close()
    except Exception as e:
        st.error(f"Preview error: {str(e)}")
206
-
207
@st.cache_resource(show_spinner=False)
def get_embedder():
    """Return the shared sentence-transformer embedder.

    Cached by Streamlit so the model is loaded once per process rather than
    on every script rerun.
    """
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embedder
211
-
212
# ========== Main Application ==========
def main():
    """Streamlit entry point.

    Wires the sidebar settings into DocumentProcessor and ChatService, then
    renders three tabs: database chat, per-document analysis, and summary.
    """
    st.set_page_config(layout="wide", page_title="🚀 PDF AI Assistant")
    st.title("🚀 Intelligent PDF Analysis Suite")

    DeepSeek_API_KEY = st.text_input(
        "🔑 Enter your DeepSeek API Key (sk-...):", "", type="password"
    )

    deepseek_reasoner, model_name, max_tokens, top_logprobs, top_logprobs_logical, temperature,frequency_penalty, presence_penalty, files = setup_sidebar()

    # Initialize core services
    #embeddings = OllamaEmbeddings(model=model_name)
    embeddings = get_embedder()
    processor = DocumentProcessor(embeddings, model_name)
    chat_service = ChatService(DeepSeek_API_KEY, deepseek_reasoner, max_tokens, temperature, frequency_penalty, presence_penalty, top_logprobs_logical, top_logprobs)

    # Main interface
    tab_db, tab_doc, tab_sum = st.tabs(["💬 Database Chat", "📄 Document Analysis", "📑 Smart Summary"])

    with tab_db:
        try:
            # NOTE(review): "Loding" is a typo in a user-facing string; left
            # untouched in this documentation-only pass.
            with st.spinner("🔄 Loding knowledge base..."):
                vector_store_db = processor._load_database()
                st.success("🏛️Database ready!")
        except Exception as e:
            # NOTE(review): _process_pdfs expects a list of uploaded file
            # objects but receives a path string here — this fallback likely
            # fails again; confirm the intended recovery path.
            vector_store_db, _ = processor._process_pdfs("./temp.pdf")
            st.error("No Database, Please Upload a PDF file and Update Basedata firstly: " + str(e))

        chat_service.vector_store = vector_store_db
        st.info("Query existing knowledge base")
        if query := st.chat_input("Ask about the database ..."):
            st.write("📜You ask:", query)
            with st.spinner("🔄 Analyzing..."):
                response = chat_service.generate_response(query, "chat")
                st.chat_message("assistant").write(response)

    if files:
        #show_pdf_preview(files)
        with st.spinner("🔄 Processing documents..."):
            # `docs` is reused by the "Update Database" buttons below; it only
            # exists when files were uploaded, hence the surrounding if/else.
            vector_store, docs = processor._process_pdfs(files)
            chat_service.vector_store = vector_store

        with tab_doc:
            if query := st.chat_input("Ask about the document..."):
                st.write("📜You ask:", query)
                with st.spinner("🔄 Analyzing..."):
                    response = chat_service.generate_response(query, "chat")
                    st.chat_message("assistant").write(response)
            if st.button("Update Database", type="primary", key=0):
                with st.spinner("🔄 Updating Database..."):
                    # Merge the freshly processed chunks into the persistent store.
                    vector_store_db = processor._load_database()
                    vector_store_db.add_documents(docs)
                    st.write("### 🏛️ Database Was Updated !!!")

        with tab_sum:
            if st.button("Generate Smart Summary", type="primary", key=1):
                with st.spinner("🤖Distilling key insights..."):
                    # Empty query: the summary template only consumes {context}.
                    summary = chat_service.generate_response("", "summary")
                    st.markdown(f"**Document Summary:**\n{summary}")

            if st.button("Update Database", type="primary", key=2):
                with st.spinner("🔄 Updating Database..."):
                    vector_store_db = processor._load_database()
                    vector_store_db.add_documents(docs)
                    st.write("### 🏛️ Database Was Updated !!!")
    else:
        st.info("📥 Please Upload a PDF File to Proceed or Ask Questions with Database.")


if __name__ == "__main__":
    main()