Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import os | |
| # ------------------------------- | |
| # LANGCHAIN IMPORTS (NEW STYLE) | |
| # ------------------------------- | |
| from langchain_community.document_loaders import PyPDFLoader, TextLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from langchain_community.vectorstores import FAISS | |
| from langchain.chains import RetrievalQA | |
| from langchain.prompts import PromptTemplate | |
| # Local LLM (no remote API; runs through a local transformers pipeline) | |
| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline | |
| from langchain_community.llms import HuggingFacePipeline | |
| # Dashboard | |
| import plotly.express as px | |
| # ------------------------------- | |
| # STREAMLIT CONFIG | |
| # ------------------------------- | |
# Page title and wide layout first, then the on-page heading.
# NOTE(review): the symbols in the heading look mis-encoded; confirm the
# intended emoji against the original file.
page_options = {"page_title": "Offline GPT RAG", "layout": "wide"}
st.set_page_config(**page_options)
st.title("π€ ChatGPT-like RAG (Offline) + π Dashboard")
| # ------------------------------- | |
| # CACHE EMBEDDINGS | |
| # ------------------------------- | |
@st.cache_resource
def load_embeddings():
    """Return the sentence-embedding model used to index and query documents.

    The function is wrapped in ``st.cache_resource`` so the model object is
    built once and then reused by later calls instead of being constructed
    again each time a vector store is created.

    Returns:
        HuggingFaceEmbeddings: Embeddings configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
| # ------------------------------- | |
| # LOAD LOCAL LLM (STABLE FIX) | |
| # ------------------------------- | |
@st.cache_resource
def load_llm():
    """Build the local text-generation model and wrap it for LangChain.

    Loads the tokenizer and seq2seq model for ``google/flan-t5-base``, wires
    them into a ``text2text-generation`` pipeline and wraps that pipeline in
    ``HuggingFacePipeline``.

    The function is wrapped in ``st.cache_resource`` so the tokenizer, model
    and pipeline are built once and reused by later calls, rather than being
    re-created every time a QA chain is assembled.

    Returns:
        HuggingFacePipeline: LangChain LLM wrapper around the local pipeline.
    """
    model_name = "google/flan-t5-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    generator = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=512,
    )
    return HuggingFacePipeline(pipeline=generator)
| # ------------------------------- | |
| # LOAD DOCUMENTS | |
| # ------------------------------- | |
def load_documents(files):
    """Load uploaded PDF / text files into LangChain documents.

    Each upload is written into a private temporary directory so the
    path-based loaders can read it. The directory is removed before the
    function returns, so nothing is left behind on disk and concurrent calls
    cannot overwrite each other's files.

    Args:
        files: Iterable of uploaded file objects exposing ``name`` and
            ``getbuffer()``. ``None`` or an empty iterable is accepted.

    Returns:
        tuple: ``(docs, stats)``. ``docs`` is the flat list of documents
        produced by the loaders. ``stats`` is a DataFrame with the columns
        ``File``, ``Type`` and ``Pages`` (one row per file, where ``Pages``
        is the number of documents the loader produced for that file). The
        columns are present even when no files were given.
    """
    import tempfile

    docs = []
    stats = []

    with tempfile.TemporaryDirectory() as workdir:
        for file in files or ():
            # Keep only the final path component so a crafted upload name
            # cannot point outside the working directory.
            safe_name = os.path.basename(file.name) or "upload"
            path = os.path.join(workdir, safe_name)
            with open(path, "wb") as f:
                f.write(file.getbuffer())

            # Case-insensitive so "REPORT.PDF" is not parsed as plain text.
            if safe_name.lower().endswith(".pdf"):
                loader = PyPDFLoader(path)
                file_type = "PDF"
            else:
                loader = TextLoader(path)
                file_type = "TXT"

            loaded_docs = loader.load()
            # The temporary path is gone once we return; record the uploaded
            # name as the source instead of a dangling location.
            for doc in loaded_docs:
                doc.metadata["source"] = file.name

            docs.extend(loaded_docs)
            stats.append({
                "File": file.name,
                "Type": file_type,
                "Pages": len(loaded_docs),
            })

    # Explicit columns keep the schema stable when there are no rows.
    return docs, pd.DataFrame(stats, columns=["File", "Type", "Pages"])
| # ------------------------------- | |
| # SPLIT DOCUMENTS | |
| # ------------------------------- | |
def split_documents(docs, chunk_size=400, chunk_overlap=50):
    """Split documents into overlapping chunks for embedding.

    Args:
        docs: Documents to split, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Value passed to the splitter as ``chunk_size``.
            Defaults to 400, the value previously hard-coded here.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 50, the value previously hard-coded here.

    Returns:
        The chunked documents returned by the splitter.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)
| # ------------------------------- | |
| # VECTOR STORE | |
| # ------------------------------- | |
def create_vectorstore(chunks):
    """Embed the chunks and index them in a FAISS vector store.

    Args:
        chunks: Non-empty sequence of document chunks to index.

    Returns:
        The FAISS vector store built from ``chunks``.

    Raises:
        ValueError: If ``chunks`` is empty. Checked up front so the caller
            gets an actionable message (for example when the uploads contain
            no extractable text) before the embedding model is loaded.
    """
    if not chunks:
        raise ValueError(
            "No text chunks to index; the uploaded documents contain no "
            "extractable text."
        )
    embeddings = load_embeddings()
    return FAISS.from_documents(chunks, embeddings)
| # ------------------------------- | |
| # QA CHAIN (FIXED PROMPT ERROR) | |
| # ------------------------------- | |
def build_qa(vs):
    """Assemble the retrieval QA chain on top of a vector store.

    Args:
        vs: Vector store exposing ``as_retriever``.

    Returns:
        A ``RetrievalQA`` chain of type ``"stuff"`` that uses the local LLM,
        a retriever configured with ``k=3`` and the context-only prompt
        defined below.
    """
    language_model = load_llm()

    # The prompt tells the model to answer from the retrieved context only
    # and to say so when the answer is not there.
    answer_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template="""
You are an intelligent assistant.
Answer ONLY using the given context.
If answer is not found, say "Not found in document".
Context:
{context}
Question:
{question}
Answer:
""",
    )

    top_k_retriever = vs.as_retriever(search_kwargs={"k": 3})
    chain_options = {"prompt": answer_prompt}
    return RetrievalQA.from_chain_type(
        llm=language_model,
        chain_type="stuff",
        retriever=top_k_retriever,
        chain_type_kwargs=chain_options,
    )
| # ------------------------------- | |
| # SESSION STATE | |
| # ------------------------------- | |
# Seed the per-session slots the first time the script runs: the QA chain
# (none until documents are processed) and the list of (question, answer)
# pairs.
for slot, initial in (("qa", None), ("history", [])):
    if slot not in st.session_state:
        st.session_state[slot] = initial
| # ------------------------------- | |
| # UPLOAD FILES | |
| # ------------------------------- | |
# Only offer the formats the loaders handle. Without a type filter any file
# (images, Office documents, ...) is accepted and then handed to the text
# loader as if it were plain text.
files = st.file_uploader(
    "Upload PDF / TXT files",
    type=["pdf", "txt"],
    accept_multiple_files=True,
)
| # ------------------------------- | |
| # PROCESS PIPELINE | |
| # ------------------------------- | |
# Fingerprint the current selection so that adding or replacing uploads
# rebuilds the index. Checking only "no chain yet" would keep answering from
# the first batch of files for the rest of the session.
upload_signature = tuple((f.name, f.size) for f in files) if files else ()
index_is_stale = (
    st.session_state.qa is None
    or st.session_state.get("upload_signature") != upload_signature
)

if files and index_is_stale:
    qa = None
    with st.spinner("Processing documents..."):
        docs, df = load_documents(files)
        chunks = split_documents(docs)
        # Files without extractable text produce no chunks; skip indexing
        # rather than failing inside the vector store.
        if chunks:
            vs = create_vectorstore(chunks)
            qa = build_qa(vs)

    if qa is None:
        # Drop any chain built from a previous selection so questions are not
        # answered from documents the user has replaced.
        st.session_state.qa = None
        st.error("No readable text was found in the uploaded files.")
    else:
        st.session_state.qa = qa
        st.session_state.df = df
        st.session_state.docs = len(docs)
        st.session_state.chunks = len(chunks)
        st.session_state.upload_signature = upload_signature
        st.success("β Ready! Ask questions now.")
| # ------------------------------- | |
| # DASHBOARD | |
| # ------------------------------- | |
if st.session_state.qa:
    st.subheader("π Analytics Dashboard")
    df = st.session_state.df

    # Headline counters, one per column, in display order.
    headline = (
        ("π Documents", st.session_state.docs),
        ("π§© Chunks", st.session_state.chunks),
        ("π Files", len(df)),
    )
    for column, (label, value) in zip(st.columns(3), headline):
        column.metric(label, value)

    # Two-point series comparing loaded documents with produced chunks.
    growth = pd.DataFrame({
        "Stage": ["Documents", "Chunks"],
        "Count": [st.session_state.docs, st.session_state.chunks]
    })

    # Each chart is built and rendered in turn: bar, pie, then line.
    chart_specs = (
        (px.bar, df,
         {"x": "File", "y": "Pages", "color": "Type", "title": "Pages per File"}),
        (px.pie, df,
         {"names": "Type", "title": "File Type Distribution"}),
        (px.line, growth,
         {"x": "Stage", "y": "Count", "markers": True, "title": "Processing Growth"}),
    )
    for make_chart, frame, options in chart_specs:
        st.plotly_chart(make_chart(frame, **options), use_container_width=True)
| # ------------------------------- | |
| # CHAT SECTION | |
| # ------------------------------- | |
st.subheader("π€ Chat with Documents")

# A form makes each question fire exactly once, on submission. With a bare
# text input the last question stays in the widget, so every unrelated rerun
# (another upload, any other interaction) would query the model again and
# append a duplicate entry to the history.
with st.form("chat_form", clear_on_submit=True):
    query = st.text_input("Ask your question")
    submitted = st.form_submit_button("Ask")

if submitted and query and st.session_state.qa:
    with st.spinner("Thinking..."):
        result = st.session_state.qa.invoke({"query": query})
    answer = result["result"]
    st.session_state.history.append((query, answer))
    st.markdown("### π§ Answer")
    st.write(answer)
elif submitted and query:
    # No chain yet: tell the user why nothing happened.
    st.warning("Upload documents before asking a question.")
| # ------------------------------- | |
| # CHAT HISTORY | |
| # ------------------------------- | |
# Show earlier exchanges, newest first, each followed by a divider.
history_log = st.session_state.history
if history_log:
    st.subheader("π¬ Chat History")
    for q, a in history_log[::-1]:
        for line in (f"**Q:** {q}", f"**A:** {a}", "---"):
            st.markdown(line)