Files changed (1) hide show
  1. app.py +99 -59
app.py CHANGED
@@ -2,28 +2,41 @@ import streamlit as st
2
  import pandas as pd
3
  import os
4
 
5
- # LangChain
 
 
6
  from langchain_community.document_loaders import PyPDFLoader, TextLoader
7
  from langchain_text_splitters import RecursiveCharacterTextSplitter
8
  from langchain_community.embeddings import HuggingFaceEmbeddings
9
  from langchain_community.vectorstores import FAISS
 
10
  from langchain.chains import RetrievalQA
 
11
 
12
- # Local LLM
13
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
14
  from langchain_community.llms import HuggingFacePipeline
15
 
16
- # Charts
17
  import plotly.express as px
18
 
19
  # -------------------------------
20
- # CONFIG
21
  # -------------------------------
22
  st.set_page_config(page_title="Offline GPT RAG", layout="wide")
23
- st.title("🤖 Offline ChatGPT-like RAG + 📊 Dashboard")
 
 
 
 
 
 
 
 
 
24
 
25
  # -------------------------------
26
- # CACHE MODEL (IMPORTANT ⚡)
27
  # -------------------------------
28
  @st.cache_resource
29
  def load_llm():
@@ -42,9 +55,9 @@ def load_llm():
42
  return HuggingFacePipeline(pipeline=pipe)
43
 
44
  # -------------------------------
45
- # LOAD DOCS
46
  # -------------------------------
47
- def load_docs(files):
48
  docs = []
49
  stats = []
50
 
@@ -58,26 +71,26 @@ def load_docs(files):
58
 
59
  if file.name.endswith(".pdf"):
60
  loader = PyPDFLoader(path)
61
- ftype = "PDF"
62
  else:
63
  loader = TextLoader(path)
64
- ftype = "TXT"
65
 
66
- loaded = loader.load()
67
- docs.extend(loaded)
68
 
69
  stats.append({
70
  "File": file.name,
71
- "Type": ftype,
72
- "Pages": len(loaded)
73
  })
74
 
75
  return docs, pd.DataFrame(stats)
76
 
77
  # -------------------------------
78
- # SPLIT
79
  # -------------------------------
80
- def split_docs(docs):
81
  splitter = RecursiveCharacterTextSplitter(
82
  chunk_size=400,
83
  chunk_overlap=50
@@ -87,43 +100,42 @@ def split_docs(docs):
87
  # -------------------------------
88
  # VECTOR STORE
89
  # -------------------------------
90
- @st.cache_resource
91
- def load_embeddings():
92
- return HuggingFaceEmbeddings(
93
- model_name="sentence-transformers/all-MiniLM-L6-v2"
94
- )
95
-
96
  def create_vectorstore(chunks):
97
- return FAISS.from_documents(chunks, load_embeddings())
 
98
 
99
  # -------------------------------
100
- # QA CHAIN (BETTER PROMPT)
101
  # -------------------------------
102
  def build_qa(vs):
103
  llm = load_llm()
104
 
105
- prompt_template = """
106
- You are an intelligent assistant.
107
- Answer ONLY from the provided context.
108
- If the answer is not in the context, say "Not found in document".
 
109
 
110
- Context:
111
- {context}
112
 
113
- Question:
114
- {question}
115
 
116
- Answer:
117
- """
 
 
118
 
119
  return RetrievalQA.from_chain_type(
120
  llm=llm,
121
  retriever=vs.as_retriever(search_kwargs={"k": 3}),
122
- chain_type_kwargs={"prompt": prompt_template}
 
123
  )
124
 
125
  # -------------------------------
126
- # SESSION
127
  # -------------------------------
128
  if "qa" not in st.session_state:
129
  st.session_state.qa = None
@@ -132,56 +144,84 @@ if "history" not in st.session_state:
132
  st.session_state.history = []
133
 
134
  # -------------------------------
135
- # UPLOAD
136
  # -------------------------------
137
- files = st.file_uploader("Upload PDF/TXT", accept_multiple_files=True)
 
 
 
138
 
139
  # -------------------------------
140
- # PROCESS
141
  # -------------------------------
142
  if files and st.session_state.qa is None:
143
- with st.spinner("Processing..."):
144
- docs, df = load_docs(files)
145
- chunks = split_docs(docs)
146
  vs = create_vectorstore(chunks)
147
  qa = build_qa(vs)
148
 
149
  st.session_state.qa = qa
150
  st.session_state.df = df
151
- st.session_state.doc_count = len(docs)
152
- st.session_state.chunk_count = len(chunks)
153
 
154
- st.success("✅ Ready!")
155
 
156
  # -------------------------------
157
  # DASHBOARD
158
  # -------------------------------
159
  if st.session_state.qa:
160
- st.subheader("📊 Analytics")
161
 
162
  df = st.session_state.df
163
 
164
- st.metric("Docs", st.session_state.doc_count)
165
- st.metric("Chunks", st.session_state.chunk_count)
 
 
166
 
167
- st.plotly_chart(px.bar(df, x="File", y="Pages", color="Type"))
168
- st.plotly_chart(px.pie(df, names="Type"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
  # -------------------------------
171
- # CHAT
172
  # -------------------------------
 
 
173
  query = st.text_input("Ask your question")
174
 
175
  if query and st.session_state.qa:
176
- result = st.session_state.qa.invoke({"query": query})
177
- answer = result["result"]
 
 
 
178
 
179
- st.session_state.history.append((query, answer))
 
180
 
181
  # -------------------------------
182
- # HISTORY
183
  # -------------------------------
184
- for q, a in reversed(st.session_state.history):
185
- st.markdown(f"**Q:** {q}")
186
- st.markdown(f"**A:** {a}")
187
- st.markdown("---")
 
 
 
 
2
  import pandas as pd
3
  import os
4
 
5
+ # -------------------------------
6
+ # LANGCHAIN IMPORTS (NEW STYLE)
7
+ # -------------------------------
8
  from langchain_community.document_loaders import PyPDFLoader, TextLoader
9
  from langchain_text_splitters import RecursiveCharacterTextSplitter
10
  from langchain_community.embeddings import HuggingFaceEmbeddings
11
  from langchain_community.vectorstores import FAISS
12
+
13
  from langchain.chains import RetrievalQA
14
+ from langchain.prompts import PromptTemplate
15
 
16
+ # Local LLM (NO API, NO TRANSFORMERS PIPELINE)
17
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
18
  from langchain_community.llms import HuggingFacePipeline
19
 
20
+ # Dashboard
21
  import plotly.express as px
22
 
23
  # -------------------------------
24
+ # STREAMLIT CONFIG
25
  # -------------------------------
26
  st.set_page_config(page_title="Offline GPT RAG", layout="wide")
27
+ st.title("🤖 ChatGPT-like RAG (Offline) + 📊 Dashboard")
28
+
29
+ # -------------------------------
30
+ # CACHE EMBEDDINGS
31
+ # -------------------------------
32
+ @st.cache_resource
33
+ def load_embeddings():
34
+ return HuggingFaceEmbeddings(
35
+ model_name="sentence-transformers/all-MiniLM-L6-v2"
36
+ )
37
 
38
  # -------------------------------
39
+ # LOAD LOCAL LLM (STABLE FIX)
40
  # -------------------------------
41
  @st.cache_resource
42
  def load_llm():
 
55
  return HuggingFacePipeline(pipeline=pipe)
56
 
57
  # -------------------------------
58
+ # LOAD DOCUMENTS
59
  # -------------------------------
60
+ def load_documents(files):
61
  docs = []
62
  stats = []
63
 
 
71
 
72
  if file.name.endswith(".pdf"):
73
  loader = PyPDFLoader(path)
74
+ file_type = "PDF"
75
  else:
76
  loader = TextLoader(path)
77
+ file_type = "TXT"
78
 
79
+ loaded_docs = loader.load()
80
+ docs.extend(loaded_docs)
81
 
82
  stats.append({
83
  "File": file.name,
84
+ "Type": file_type,
85
+ "Pages": len(loaded_docs)
86
  })
87
 
88
  return docs, pd.DataFrame(stats)
89
 
90
  # -------------------------------
91
+ # SPLIT DOCUMENTS
92
  # -------------------------------
93
+ def split_documents(docs):
94
  splitter = RecursiveCharacterTextSplitter(
95
  chunk_size=400,
96
  chunk_overlap=50
 
100
  # -------------------------------
101
  # VECTOR STORE
102
  # -------------------------------
 
 
 
 
 
 
103
  def create_vectorstore(chunks):
104
+ embeddings = load_embeddings()
105
+ return FAISS.from_documents(chunks, embeddings)
106
 
107
  # -------------------------------
108
+ # QA CHAIN (FIXED PROMPT ERROR)
109
  # -------------------------------
110
  def build_qa(vs):
111
  llm = load_llm()
112
 
113
+ prompt = PromptTemplate(
114
+ template="""
115
+ You are an intelligent assistant.
116
+ Answer ONLY using the given context.
117
+ If answer is not found, say "Not found in document".
118
 
119
+ Context:
120
+ {context}
121
 
122
+ Question:
123
+ {question}
124
 
125
+ Answer:
126
+ """,
127
+ input_variables=["context", "question"]
128
+ )
129
 
130
  return RetrievalQA.from_chain_type(
131
  llm=llm,
132
  retriever=vs.as_retriever(search_kwargs={"k": 3}),
133
+ chain_type="stuff",
134
+ chain_type_kwargs={"prompt": prompt}
135
  )
136
 
137
  # -------------------------------
138
+ # SESSION STATE
139
  # -------------------------------
140
  if "qa" not in st.session_state:
141
  st.session_state.qa = None
 
144
  st.session_state.history = []
145
 
146
  # -------------------------------
147
+ # UPLOAD FILES
148
  # -------------------------------
149
+ files = st.file_uploader(
150
+ "Upload PDF / TXT files",
151
+ accept_multiple_files=True
152
+ )
153
 
154
  # -------------------------------
155
+ # PROCESS PIPELINE
156
  # -------------------------------
157
  if files and st.session_state.qa is None:
158
+ with st.spinner("Processing documents..."):
159
+ docs, df = load_documents(files)
160
+ chunks = split_documents(docs)
161
  vs = create_vectorstore(chunks)
162
  qa = build_qa(vs)
163
 
164
  st.session_state.qa = qa
165
  st.session_state.df = df
166
+ st.session_state.docs = len(docs)
167
+ st.session_state.chunks = len(chunks)
168
 
169
+ st.success("✅ Ready! Ask questions now.")
170
 
171
  # -------------------------------
172
  # DASHBOARD
173
  # -------------------------------
174
  if st.session_state.qa:
175
+ st.subheader("📊 Analytics Dashboard")
176
 
177
  df = st.session_state.df
178
 
179
+ col1, col2, col3 = st.columns(3)
180
+ col1.metric("📄 Documents", st.session_state.docs)
181
+ col2.metric("🧩 Chunks", st.session_state.chunks)
182
+ col3.metric("📁 Files", len(df))
183
 
184
+ # Bar chart
185
+ fig1 = px.bar(df, x="File", y="Pages", color="Type", title="Pages per File")
186
+ st.plotly_chart(fig1, use_container_width=True)
187
+
188
+ # Pie chart
189
+ fig2 = px.pie(df, names="Type", title="File Type Distribution")
190
+ st.plotly_chart(fig2, use_container_width=True)
191
+
192
+ # Growth chart
193
+ growth = pd.DataFrame({
194
+ "Stage": ["Documents", "Chunks"],
195
+ "Count": [st.session_state.docs, st.session_state.chunks]
196
+ })
197
+
198
+ fig3 = px.line(growth, x="Stage", y="Count", markers=True, title="Processing Growth")
199
+ st.plotly_chart(fig3, use_container_width=True)
200
 
201
  # -------------------------------
202
+ # CHAT SECTION
203
  # -------------------------------
204
+ st.subheader("🤖 Chat with Documents")
205
+
206
  query = st.text_input("Ask your question")
207
 
208
  if query and st.session_state.qa:
209
+ with st.spinner("Thinking..."):
210
+ result = st.session_state.qa.invoke({"query": query})
211
+ answer = result["result"]
212
+
213
+ st.session_state.history.append((query, answer))
214
 
215
+ st.markdown("### 🧠 Answer")
216
+ st.write(answer)
217
 
218
  # -------------------------------
219
+ # CHAT HISTORY
220
  # -------------------------------
221
+ if st.session_state.history:
222
+ st.subheader("💬 Chat History")
223
+
224
+ for q, a in reversed(st.session_state.history):
225
+ st.markdown(f"**Q:** {q}")
226
+ st.markdown(f"**A:** {a}")
227
+ st.markdown("---")