# Chatbot / app.py
# Muthuraja18's picture
# Update app.py (#18)
# c273036
import streamlit as st
import pandas as pd
import os
# -------------------------------
# LANGCHAIN IMPORTS (NEW STYLE)
# -------------------------------
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
# Local LLM (no external API; runs through a local transformers pipeline)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain_community.llms import HuggingFacePipeline
# Dashboard
import plotly.express as px
# -------------------------------
# STREAMLIT CONFIG
# -------------------------------
# Page-level settings (browser tab title, wide layout) followed by the
# on-page heading. The emoji in the heading were stored as mis-decoded
# UTF-8 bytes; they are restored to the intended characters here.
st.set_page_config(page_title="Offline GPT RAG", layout="wide")
st.title("🤖 ChatGPT-like RAG (Offline) + 📊 Dashboard")
# -------------------------------
# CACHE EMBEDDINGS
# -------------------------------
@st.cache_resource
def load_embeddings():
    """Return the sentence-embedding model used to index document chunks.

    The function is decorated with ``st.cache_resource`` so Streamlit can
    reuse the returned object instead of rebuilding it on every rerun.
    """
    embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=embedding_model)
# -------------------------------
# LOAD LOCAL LLM (STABLE FIX)
# -------------------------------
@st.cache_resource
def load_llm():
    """Return the local FLAN-T5 generator wrapped as a LangChain LLM.

    Loads the tokenizer and seq2seq weights for ``google/flan-t5-base``,
    builds a ``text2text-generation`` pipeline capped at ``max_length=512``
    and hands it to ``HuggingFacePipeline``. Decorated with
    ``st.cache_resource`` so the result can be reused across reruns.
    """
    checkpoint = "google/flan-t5-base"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    generator = pipeline(
        "text2text-generation",
        model=seq2seq,
        tokenizer=tok,
        max_length=512,
    )
    return HuggingFacePipeline(pipeline=generator)
# -------------------------------
# LOAD DOCUMENTS
# -------------------------------
def load_documents(files):
    """Save uploaded files under ``temp/`` and load them as documents.

    Args:
        files: Iterable of uploaded file objects exposing ``.name`` and
            ``.getbuffer()``.

    Returns:
        A ``(docs, stats)`` tuple. ``docs`` is the flat list of everything
        the loaders returned. ``stats`` is a DataFrame with one row per
        file and the columns ``File``, ``Type`` and ``Pages`` (``Pages`` is
        the number of items the loader returned for that file). The columns
        are present even when ``files`` is empty.
    """
    docs = []
    stats = []
    os.makedirs("temp", exist_ok=True)
    for file in files:
        # The name is supplied by the client: keep only its last path
        # component so the file can never be written outside temp/.
        safe_name = os.path.basename(file.name.replace("\\", "/")) or "upload"
        path = os.path.join("temp", safe_name)
        with open(path, "wb") as f:
            f.write(file.getbuffer())
        # Case-insensitive match so "REPORT.PDF" is not sent to the text loader.
        if safe_name.lower().endswith(".pdf"):
            loader = PyPDFLoader(path)
            file_type = "PDF"
        else:
            loader = TextLoader(path)
            file_type = "TXT"
        loaded_docs = loader.load()
        docs.extend(loaded_docs)
        stats.append({
            "File": file.name,
            "Type": file_type,
            "Pages": len(loaded_docs),
        })
    # Explicit columns keep the frame's schema stable for empty input.
    return docs, pd.DataFrame(stats, columns=["File", "Type", "Pages"])
# -------------------------------
# SPLIT DOCUMENTS
# -------------------------------
def split_documents(docs, chunk_size=400, chunk_overlap=50):
    """Split loaded documents into overlapping chunks.

    Args:
        docs: Documents to split.
        chunk_size: Value passed to the splitter's ``chunk_size``
            (default 400, the previously hard-coded value).
        chunk_overlap: Value passed to the splitter's ``chunk_overlap``
            (default 50, the previously hard-coded value).

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns
        for ``docs``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)
# -------------------------------
# VECTOR STORE
# -------------------------------
def create_vectorstore(chunks):
    """Embed ``chunks`` with the shared embedding model and index them in FAISS."""
    embedder = load_embeddings()
    store = FAISS.from_documents(chunks, embedder)
    return store
# -------------------------------
# QA CHAIN (FIXED PROMPT ERROR)
# -------------------------------
def build_qa(vs):
    """Assemble a "stuff" RetrievalQA chain over the vector store ``vs``.

    The chain uses the local LLM, a retriever configured with ``k=3`` and a
    prompt that tells the model to answer only from the supplied context.
    """
    local_llm = load_llm()
    template_text = """
You are an intelligent assistant.
Answer ONLY using the given context.
If answer is not found, say "Not found in document".
Context:
{context}
Question:
{question}
Answer:
"""
    qa_prompt = PromptTemplate(
        template=template_text,
        input_variables=["context", "question"],
    )
    top_k_retriever = vs.as_retriever(search_kwargs={"k": 3})
    return RetrievalQA.from_chain_type(
        llm=local_llm,
        retriever=top_k_retriever,
        chain_type="stuff",
        chain_type_kwargs={"prompt": qa_prompt},
    )
# -------------------------------
# SESSION STATE
# -------------------------------
# Seed the per-session defaults when they are missing: "qa" holds the
# retrieval chain (None until documents have been processed) and "history"
# the list of (question, answer) pairs.
for _key, _default in (("qa", None), ("history", [])):
    if _key not in st.session_state:
        st.session_state[_key] = _default
# -------------------------------
# UPLOAD FILES
# -------------------------------
# Upload widget. The label promises PDF / TXT only, so restrict the accepted
# extensions accordingly; otherwise any other file type would be handed to
# the plain-text loader downstream.
files = st.file_uploader(
    "Upload PDF / TXT files",
    type=["pdf", "txt"],
    accept_multiple_files=True,
)
# -------------------------------
# PROCESS PIPELINE
# -------------------------------
# Build (or rebuild) the QA chain for the current upload set.
if files:
    # Fingerprint the uploads by name and size so that changing the selection
    # triggers a rebuild; previously the first index was kept forever.
    upload_key = tuple(sorted((f.name, f.size) for f in files))
    if st.session_state.get("upload_key") != upload_key:
        with st.spinner("Processing documents..."):
            docs, df = load_documents(files)
            chunks = split_documents(docs)
            # An index cannot be built from zero chunks (e.g. files with no
            # extractable text), so skip the build and report it below.
            qa = build_qa(create_vectorstore(chunks)) if chunks else None
        # Remember the fingerprint either way so a failing upload set is not
        # reprocessed on every rerun.
        st.session_state.upload_key = upload_key
        st.session_state.qa = qa
        if qa is None:
            st.error("No text could be extracted from the uploaded files.")
        else:
            st.session_state.df = df
            st.session_state.docs = len(docs)
            st.session_state.chunks = len(chunks)
            st.success("✅ Ready! Ask questions now.")
# -------------------------------
# DASHBOARD
# -------------------------------
# Analytics dashboard, shown once a QA chain exists. The emoji in the labels
# were stored as mis-decoded UTF-8 bytes and are restored here.
if st.session_state.qa:
    st.subheader("📊 Analytics Dashboard")
    df = st.session_state.df

    # Headline counters: loaded documents, chunks, and uploaded files.
    col1, col2, col3 = st.columns(3)
    col1.metric("📄 Documents", st.session_state.docs)
    col2.metric("🧩 Chunks", st.session_state.chunks)
    col3.metric("📁 Files", len(df))

    # Bar chart
    fig1 = px.bar(df, x="File", y="Pages", color="Type", title="Pages per File")
    st.plotly_chart(fig1, use_container_width=True)

    # Pie chart
    fig2 = px.pie(df, names="Type", title="File Type Distribution")
    st.plotly_chart(fig2, use_container_width=True)

    # Growth chart: document count versus chunk count.
    growth = pd.DataFrame({
        "Stage": ["Documents", "Chunks"],
        "Count": [st.session_state.docs, st.session_state.chunks],
    })
    fig3 = px.line(growth, x="Stage", y="Count", markers=True, title="Processing Growth")
    st.plotly_chart(fig3, use_container_width=True)
# -------------------------------
# CHAT SECTION
# -------------------------------
# Chat input and answer display.
st.subheader("🤖 Chat with Documents")
query = st.text_input("Ask your question")
if query and st.session_state.qa:
    chain = st.session_state.qa
    # The script reruns on every widget interaction while the text box still
    # holds the previous question. Reuse the stored answer when neither the
    # question nor the chain changed, instead of invoking the model again and
    # appending a duplicate history entry.
    last_call = st.session_state.get("last_call")
    if last_call and last_call[0] == query and last_call[1] is chain:
        answer = last_call[2]
    else:
        with st.spinner("Thinking..."):
            result = chain.invoke({"query": query})
        answer = result["result"]
        st.session_state.history.append((query, answer))
        st.session_state.last_call = (query, chain, answer)
    st.markdown("### 🧠 Answer")
    st.write(answer)
elif query:
    # A question was typed but no chain exists yet; say so instead of
    # silently ignoring the input.
    st.warning("Upload and process documents before asking a question.")
# -------------------------------
# CHAT HISTORY
# -------------------------------
# Past questions and answers, newest first. The emoji in the heading was
# stored as mis-decoded UTF-8 bytes and is restored here.
if st.session_state.history:
    st.subheader("💬 Chat History")
    for q, a in reversed(st.session_state.history):
        st.markdown(f"**Q:** {q}")
        st.markdown(f"**A:** {a}")
        st.markdown("---")