Muthuraja18 committed on
Commit
d13bf3b
·
verified ·
1 Parent(s): 9a2b77a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +155 -63
app.py CHANGED
@@ -1,8 +1,8 @@
1
  import streamlit as st
2
- import tempfile
3
  import os
4
 
5
- # LangChain imports (new structure)
6
  from langchain_community.document_loaders import PyPDFLoader, TextLoader
7
  from langchain_text_splitters import RecursiveCharacterTextSplitter
8
  from langchain_community.embeddings import HuggingFaceEmbeddings
@@ -10,107 +10,199 @@ from langchain_community.vectorstores import FAISS
10
  from langchain_community.llms import HuggingFacePipeline
11
  from langchain.chains import RetrievalQA
12
 
 
13
  from transformers import pipeline
14
 
 
 
15
 
16
  # -------------------------------
17
- # Load Documents (FIXED temp file handling)
18
  # -------------------------------
19
- def load_documents(uploaded_files):
20
- documents = []
21
 
22
- for file in uploaded_files:
23
- suffix = file.name.split(".")[-1]
 
 
 
 
 
 
 
 
 
24
 
25
- with tempfile.NamedTemporaryFile(delete=False, suffix=f".{suffix}") as tmp:
26
- tmp.write(file.getbuffer())
27
- tmp_path = tmp.name
 
 
28
 
29
- if suffix == "pdf":
30
- loader = PyPDFLoader(tmp_path)
31
- else:
32
- loader = TextLoader(tmp_path)
 
 
33
 
34
- documents.extend(loader.load())
 
 
35
 
36
- os.remove(tmp_path) # cleanup
 
37
 
38
- return documents
 
 
 
 
 
 
 
 
39
 
 
 
 
 
 
 
 
40
 
41
  # -------------------------------
42
- # Split Documents
43
  # -------------------------------
44
- def split_documents(documents):
45
  splitter = RecursiveCharacterTextSplitter(
46
  chunk_size=500,
47
  chunk_overlap=50
48
  )
49
- return splitter.split_documents(documents)
50
-
51
 
52
  # -------------------------------
53
- # Create Vector Store
54
  # -------------------------------
55
  def create_vectorstore(chunks):
56
- embeddings = HuggingFaceEmbeddings(
57
- model_name="sentence-transformers/all-MiniLM-L6-v2"
58
- )
59
  return FAISS.from_documents(chunks, embeddings)
60
 
61
-
62
  # -------------------------------
63
- # Load Local LLM (STABLE VERSION)
64
  # -------------------------------
65
- @st.cache_resource
66
- def load_llm():
67
- pipe = pipeline(
68
- task="text2text-generation",
69
- model="google/flan-t5-base",
70
- max_length=512,
71
- do_sample=False
72
  )
73
- return HuggingFacePipeline(pipeline=pipe)
74
 
 
 
 
 
 
 
 
75
 
76
  # -------------------------------
77
- # Build QA Chain
78
  # -------------------------------
79
- def build_qa(vectorstore):
80
- llm = load_llm()
81
- retriever = vectorstore.as_retriever()
82
 
83
- qa = RetrievalQA.from_chain_type(
84
- llm=llm,
85
- retriever=retriever
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  )
87
- return qa
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
  # -------------------------------
91
- # Streamlit UI
92
  # -------------------------------
93
- st.set_page_config(page_title="RAG Chatbot", layout="wide")
94
- st.title("📄 Chat with Your Documents (RAG)")
95
 
96
- uploaded_files = st.file_uploader(
97
- "Upload PDF or TXT files",
98
- accept_multiple_files=True
99
- )
100
 
101
- if uploaded_files:
102
- with st.spinner("Processing documents..."):
103
- docs = load_documents(uploaded_files)
104
- chunks = split_documents(docs)
105
- vectorstore = create_vectorstore(chunks)
106
- qa_chain = build_qa(vectorstore)
107
 
108
- st.success("✅ Documents ready!")
 
109
 
110
- query = st.text_input("Ask a question from your documents")
 
 
 
 
111
 
112
- if query:
113
- with st.spinner("Generating answer..."):
114
- result = qa_chain.invoke({"query": query})
115
- st.write("### Answer:")
116
- st.write(result["result"])
 
1
  import streamlit as st
2
+ import pandas as pd
3
  import os
4
 
5
+ # LangChain
6
  from langchain_community.document_loaders import PyPDFLoader, TextLoader
7
  from langchain_text_splitters import RecursiveCharacterTextSplitter
8
  from langchain_community.embeddings import HuggingFaceEmbeddings
 
10
  from langchain_community.llms import HuggingFacePipeline
11
  from langchain.chains import RetrievalQA
12
 
13
+ # Transformers
14
  from transformers import pipeline
15
 
16
+ # Charts
17
+ import plotly.express as px
18
 
19
  # -------------------------------
20
+ # PAGE CONFIG
21
  # -------------------------------
22
+ st.set_page_config(page_title="RAG + Analytics", layout="wide")
23
+ st.title("📄 RAG Chatbot + 📊 Analytics Dashboard")
24
 
25
+ # -------------------------------
26
+ # CACHE (VERY IMPORTANT ⚑)
27
+ # -------------------------------
28
+ @st.cache_resource
29
+ def load_llm():
30
+ pipe = pipeline(
31
+ "text2text-generation",
32
+ model="google/flan-t5-base",
33
+ max_length=512
34
+ )
35
+ return HuggingFacePipeline(pipeline=pipe)
36
 
37
+ @st.cache_resource
38
+ def load_embeddings():
39
+ return HuggingFaceEmbeddings(
40
+ model_name="sentence-transformers/all-MiniLM-L6-v2"
41
+ )
42
 
43
+ # -------------------------------
44
+ # LOAD DOCUMENTS
45
+ # -------------------------------
46
+ def load_documents(files):
47
+ docs = []
48
+ stats = []
49
 
50
+ for file in files:
51
+ path = os.path.join("temp", file.name)
52
+ os.makedirs("temp", exist_ok=True)
53
 
54
+ with open(path, "wb") as f:
55
+ f.write(file.getbuffer())
56
 
57
+ if file.name.endswith(".pdf"):
58
+ loader = PyPDFLoader(path)
59
+ ftype = "PDF"
60
+ else:
61
+ loader = TextLoader(path)
62
+ ftype = "TXT"
63
+
64
+ loaded = loader.load()
65
+ docs.extend(loaded)
66
 
67
+ stats.append({
68
+ "File": file.name,
69
+ "Type": ftype,
70
+ "Pages": len(loaded)
71
+ })
72
+
73
+ return docs, pd.DataFrame(stats)
74
 
75
  # -------------------------------
76
+ # SPLIT DOCUMENTS
77
  # -------------------------------
78
+ def split_docs(docs):
79
  splitter = RecursiveCharacterTextSplitter(
80
  chunk_size=500,
81
  chunk_overlap=50
82
  )
83
+ return splitter.split_documents(docs)
 
84
 
85
  # -------------------------------
86
+ # VECTOR STORE
87
  # -------------------------------
88
  def create_vectorstore(chunks):
89
+ embeddings = load_embeddings()
 
 
90
  return FAISS.from_documents(chunks, embeddings)
91
 
 
92
  # -------------------------------
93
+ # QA CHAIN
94
  # -------------------------------
95
+ def build_qa(vs):
96
+ llm = load_llm()
97
+ return RetrievalQA.from_chain_type(
98
+ llm=llm,
99
+ retriever=vs.as_retriever()
 
 
100
  )
 
101
 
102
+ # -------------------------------
103
+ # FILE UPLOAD
104
+ # -------------------------------
105
+ files = st.file_uploader(
106
+ "Upload PDF / TXT files",
107
+ accept_multiple_files=True
108
+ )
109
 
110
  # -------------------------------
111
+ # SESSION STATE
112
  # -------------------------------
113
+ if "qa" not in st.session_state:
114
+ st.session_state.qa = None
 
115
 
116
+ if "history" not in st.session_state:
117
+ st.session_state.history = []
118
+
119
+ # -------------------------------
120
+ # PROCESS FILES
121
+ # -------------------------------
122
+ if files and st.session_state.qa is None:
123
+ with st.spinner("Processing documents..."):
124
+ docs, df = load_documents(files)
125
+ chunks = split_docs(docs)
126
+ vs = create_vectorstore(chunks)
127
+ qa = build_qa(vs)
128
+
129
+ st.session_state.qa = qa
130
+ st.session_state.df = df
131
+ st.session_state.chunk_count = len(chunks)
132
+ st.session_state.doc_count = len(docs)
133
+
134
+ st.success("✅ Documents processed!")
135
+
136
+ # -------------------------------
137
+ # DASHBOARD
138
+ # -------------------------------
139
+ if st.session_state.qa:
140
+
141
+ st.subheader("📊 Analytics Dashboard")
142
+
143
+ df = st.session_state.df
144
+
145
+ col1, col2, col3 = st.columns(3)
146
+
147
+ col1.metric("📄 Total Documents", st.session_state.doc_count)
148
+ col2.metric("🧩 Total Chunks", st.session_state.chunk_count)
149
+ col3.metric("📁 Files Uploaded", len(df))
150
+
151
+ # ---- Bar Chart ----
152
+ fig1 = px.bar(
153
+ df,
154
+ x="File",
155
+ y="Pages",
156
+ color="Type",
157
+ title="Pages per File"
158
  )
159
+ st.plotly_chart(fig1, use_container_width=True)
160
 
161
+ # ---- Pie Chart ----
162
+ fig2 = px.pie(
163
+ df,
164
+ names="Type",
165
+ title="File Type Distribution"
166
+ )
167
+ st.plotly_chart(fig2, use_container_width=True)
168
+
169
+ # ---- Line Chart ----
170
+ growth_df = pd.DataFrame({
171
+ "Stage": ["Documents", "Chunks"],
172
+ "Count": [st.session_state.doc_count, st.session_state.chunk_count]
173
+ })
174
+
175
+ fig3 = px.line(
176
+ growth_df,
177
+ x="Stage",
178
+ y="Count",
179
+ markers=True,
180
+ title="Processing Growth"
181
+ )
182
+ st.plotly_chart(fig3, use_container_width=True)
183
 
184
  # -------------------------------
185
+ # CHATBOT
186
  # -------------------------------
187
+ st.subheader("🤖 Chat with Documents")
 
188
 
189
+ query = st.text_input("Ask your question...")
 
 
 
190
 
191
+ if query and st.session_state.qa:
192
+ with st.spinner("Thinking..."):
193
+ result = st.session_state.qa.invoke({"query": query})
194
+ answer = result["result"]
 
 
195
 
196
+ # Save history
197
+ st.session_state.history.append((query, answer))
198
 
199
+ # -------------------------------
200
+ # CHAT HISTORY
201
+ # -------------------------------
202
+ if st.session_state.history:
203
+ st.subheader("💬 Chat History")
204
 
205
+ for q, a in reversed(st.session_state.history):
206
+ st.markdown(f"**🧑 Question:** {q}")
207
+ st.markdown(f"**🤖 Answer:** {a}")
208
+ st.markdown("---")