"""Streamlit app: offline retrieval-augmented Q&A over uploaded PDF/TXT files.

Uploaded files are copied to a temporary directory, loaded with LangChain
loaders, split into chunks, embedded into a FAISS index and queried through a
``RetrievalQA`` chain backed by a locally-run Flan-T5 model. A small Plotly
dashboard summarizes what was indexed.
"""

import os
import tempfile

import pandas as pd
import streamlit as st

# -------------------------------
# LANGCHAIN IMPORTS (NEW STYLE)
# -------------------------------
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Local LLM (no remote API; runs through a local transformers pipeline)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain_community.llms import HuggingFacePipeline

# Dashboard
import plotly.express as px

# -------------------------------
# STREAMLIT CONFIG
# -------------------------------
st.set_page_config(page_title="Offline GPT RAG", layout="wide")
st.title("🤖 ChatGPT-like RAG (Offline) + 📊 Dashboard")


# -------------------------------
# CACHE EMBEDDINGS
# -------------------------------
@st.cache_resource
def load_embeddings():
    """Return the sentence-embedding model (cached by ``st.cache_resource``)."""
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )


# -------------------------------
# LOAD LOCAL LLM
# -------------------------------
@st.cache_resource
def load_llm():
    """Return a LangChain LLM wrapping a local Flan-T5 text2text pipeline.

    Cached by ``st.cache_resource`` so the tokenizer and model are only
    instantiated once rather than on every script rerun.
    """
    model_name = "google/flan-t5-base"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=512,
    )

    return HuggingFacePipeline(pipeline=pipe)


# -------------------------------
# LOAD DOCUMENTS
# -------------------------------
def load_documents(files):
    """Load uploaded files into LangChain documents.

    Args:
        files: Iterable of uploaded-file objects exposing ``name`` and
            ``getbuffer()`` (as returned by ``st.file_uploader``).

    Returns:
        A ``(docs, stats)`` tuple: ``docs`` is the list of documents returned
        by the loaders, ``stats`` is a DataFrame with one row per file and the
        columns ``File``, ``Type`` and ``Pages`` (``Pages`` is the number of
        documents the loader produced for that file).
    """
    docs = []
    stats = []

    # The loaders read eagerly in .load(), so the on-disk copies are only
    # needed inside this block; the directory is removed on exit instead of
    # accumulating user uploads in a persistent "temp" folder.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for file in files:
            # The name is client-supplied: strip any directory components so
            # the write cannot escape the temporary directory.
            safe_name = os.path.basename(file.name) or "upload"
            path = os.path.join(tmp_dir, safe_name)

            with open(path, "wb") as f:
                f.write(file.getbuffer())

            # Case-insensitive so "REPORT.PDF" is not parsed as plain text.
            if safe_name.lower().endswith(".pdf"):
                loader = PyPDFLoader(path)
                file_type = "PDF"
            else:
                loader = TextLoader(path)
                file_type = "TXT"

            loaded_docs = loader.load()
            docs.extend(loaded_docs)

            stats.append({
                "File": file.name,
                "Type": file_type,
                "Pages": len(loaded_docs),
            })

    # Explicit columns keep the frame's schema stable even with no rows.
    return docs, pd.DataFrame(stats, columns=["File", "Type", "Pages"])


# -------------------------------
# SPLIT DOCUMENTS
# -------------------------------
def split_documents(docs):
    """Split documents into 400-character chunks with a 50-character overlap."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=50,
    )
    return splitter.split_documents(docs)


# -------------------------------
# VECTOR STORE
# -------------------------------
def create_vectorstore(chunks):
    """Embed ``chunks`` and return a FAISS vector store built from them."""
    embeddings = load_embeddings()
    return FAISS.from_documents(chunks, embeddings)


# -------------------------------
# QA CHAIN
# -------------------------------
def build_qa(vs):
    """Build a "stuff" RetrievalQA chain over vector store ``vs``.

    The retriever returns the top 3 chunks, which are inserted into the
    ``{context}`` slot of the prompt alongside the user's ``{question}``.
    """
    llm = load_llm()

    prompt = PromptTemplate(
        template="""
You are an intelligent assistant.
Answer ONLY using the given context.
If answer is not found, say "Not found in document".

Context:
{context}

Question:
{question}

Answer:
""",
        input_variables=["context", "question"],
    )

    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vs.as_retriever(search_kwargs={"k": 3}),
        chain_type="stuff",
        chain_type_kwargs={"prompt": prompt},
    )


# -------------------------------
# SESSION STATE
# -------------------------------
if "qa" not in st.session_state:
    st.session_state.qa = None

if "history" not in st.session_state:
    st.session_state.history = []

# Signature of the uploads the current index was built from; lets the app
# notice when the user adds or replaces files.
if "files_signature" not in st.session_state:
    st.session_state.files_signature = None

# Last answered question and its answer; prevents re-running the chain (and
# duplicating history) on reruns that did not submit a new question.
if "last_query" not in st.session_state:
    st.session_state.last_query = None

if "last_answer" not in st.session_state:
    st.session_state.last_answer = None

# -------------------------------
# UPLOAD FILES
# -------------------------------
files = st.file_uploader(
    "Upload PDF / TXT files",
    accept_multiple_files=True,
)

# -------------------------------
# PROCESS PIPELINE
# -------------------------------
if files:
    signature = tuple((f.name, f.size) for f in files)

    # Rebuild whenever there is no index yet or the set of uploads changed.
    # If the uploader is cleared, the previously built index is kept.
    needs_build = (
        st.session_state.qa is None
        or signature != st.session_state.files_signature
    )

    if needs_build:
        with st.spinner("Processing documents..."):
            docs, df = load_documents(files)
            chunks = split_documents(docs)

            if chunks:
                vs = create_vectorstore(chunks)
                qa = build_qa(vs)
            else:
                qa = None

        if qa is None:
            # e.g. image-only PDFs or empty text files: nothing to index.
            st.session_state.qa = None
            st.session_state.files_signature = None
            st.error("No extractable text was found in the uploaded files.")
        else:
            st.session_state.qa = qa
            st.session_state.df = df
            st.session_state.docs = len(docs)
            st.session_state.chunks = len(chunks)
            st.session_state.files_signature = signature

            # The index changed, so the same question may now have a
            # different answer; allow it to be asked again.
            st.session_state.last_query = None
            st.session_state.last_answer = None

            st.success("✅ Ready! Ask questions now.")

# -------------------------------
# DASHBOARD
# -------------------------------
if st.session_state.qa is not None:

    st.subheader("📊 Analytics Dashboard")

    df = st.session_state.df

    col1, col2, col3 = st.columns(3)

    col1.metric("📄 Documents", st.session_state.docs)
    col2.metric("🧩 Chunks", st.session_state.chunks)
    col3.metric("📁 Files", len(df))

    # Bar chart
    fig1 = px.bar(df, x="File", y="Pages", color="Type",
                  title="Pages per File")
    st.plotly_chart(fig1, use_container_width=True)

    # Pie chart
    fig2 = px.pie(df, names="Type", title="File Type Distribution")
    st.plotly_chart(fig2, use_container_width=True)

    # Growth chart
    growth = pd.DataFrame({
        "Stage": ["Documents", "Chunks"],
        "Count": [st.session_state.docs, st.session_state.chunks],
    })

    fig3 = px.line(growth, x="Stage", y="Count", markers=True,
                   title="Processing Growth")
    st.plotly_chart(fig3, use_container_width=True)

# -------------------------------
# CHAT SECTION
# -------------------------------
st.subheader("🤖 Chat with Documents")

query = st.text_input("Ask your question")

if query and st.session_state.qa is not None:

    # Streamlit reruns the whole script on every widget interaction while the
    # text box keeps its value; only call the model for a new question.
    if query != st.session_state.last_query:
        with st.spinner("Thinking..."):
            result = st.session_state.qa.invoke({"query": query})

        answer = result["result"]

        st.session_state.last_query = query
        st.session_state.last_answer = answer
        st.session_state.history.append((query, answer))

    st.markdown("### 🧠 Answer")
    st.write(st.session_state.last_answer)

# -------------------------------
# CHAT HISTORY
# -------------------------------
if st.session_state.history:

    st.subheader("💬 Chat History")

    for q, a in reversed(st.session_state.history):
        st.markdown(f"**Q:** {q}")
        st.markdown(f"**A:** {a}")
        st.markdown("---")