Spaces:

Muthuraja18
/

Chatbot

Sleeping

App Files Files Community

Update app.py

#17

by Muthuraja18 - opened Apr 26

base: refs/heads/main

←

from: refs/pr/17

Discussion Files changed

+72

-93

Files changed (1) hide show

app.py +72 -93

app.py CHANGED Viewed

@@ -7,49 +7,51 @@ from langchain_community.document_loaders import PyPDFLoader, TextLoader
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
-from langchain_community.llms import HuggingFacePipeline
 from langchain.chains import RetrievalQA
-# Transformers
-from transformers import pipeline
 # Charts
 import plotly.express as px
 # -------------------------------
-# PAGE CONFIG
 # -------------------------------
-st.set_page_config(page_title="RAG + Analytics", layout="wide")
-st.title("📄 RAG Chatbot + 📊 Analytics Dashboard")
 # -------------------------------
-# CACHE (VERY IMPORTANT ⚡)
 # -------------------------------
 @st.cache_resource
 def load_llm():
     pipe = pipeline(
         "text2text-generation",
-        model="google/flan-t5-base",
         max_length=512
     )
-    return HuggingFacePipeline(pipeline=pipe)
-@st.cache_resource
-def load_embeddings():
-    return HuggingFaceEmbeddings(
-        model_name="sentence-transformers/all-MiniLM-L6-v2"
-    )
 # -------------------------------
-# LOAD DOCUMENTS
 # -------------------------------
-def load_documents(files):
     docs = []
     stats = []
     for file in files:
         path = os.path.join("temp", file.name)
-        os.makedirs("temp", exist_ok=True)
         with open(path, "wb") as f:
             f.write(file.getbuffer())
@@ -73,11 +75,11 @@ def load_documents(files):
     return docs, pd.DataFrame(stats)
 # -------------------------------
-# SPLIT DOCUMENTS
 # -------------------------------
 def split_docs(docs):
     splitter = RecursiveCharacterTextSplitter(
-        chunk_size=500,
         chunk_overlap=50
     )
     return splitter.split_documents(docs)
@@ -85,30 +87,43 @@ def split_docs(docs):
 # -------------------------------
 # VECTOR STORE
 # -------------------------------
 def create_vectorstore(chunks):
-    embeddings = load_embeddings()
-    return FAISS.from_documents(chunks, embeddings)
 # -------------------------------
-# QA CHAIN
 # -------------------------------
 def build_qa(vs):
     llm = load_llm()
     return RetrievalQA.from_chain_type(
         llm=llm,
-        retriever=vs.as_retriever()
     )
 # -------------------------------
-# FILE UPLOAD
-# -------------------------------
-files = st.file_uploader(
-    "Upload PDF / TXT files",
-    accept_multiple_files=True
-)
-# -------------------------------
-# SESSION STATE
 # -------------------------------
 if "qa" not in st.session_state:
     st.session_state.qa = None
@@ -117,92 +132,56 @@ if "history" not in st.session_state:
     st.session_state.history = []
 # -------------------------------
-# PROCESS FILES
 # -------------------------------
 if files and st.session_state.qa is None:
-    with st.spinner("Processing documents..."):
-        docs, df = load_documents(files)
         chunks = split_docs(docs)
         vs = create_vectorstore(chunks)
         qa = build_qa(vs)
         st.session_state.qa = qa
         st.session_state.df = df
-        st.session_state.chunk_count = len(chunks)
         st.session_state.doc_count = len(docs)
-    st.success("✅ Documents processed!")
 # -------------------------------
 # DASHBOARD
 # -------------------------------
 if st.session_state.qa:
-    st.subheader("📊 Analytics Dashboard")
     df = st.session_state.df
-    col1, col2, col3 = st.columns(3)
-    col1.metric("📄 Total Documents", st.session_state.doc_count)
-    col2.metric("🧩 Total Chunks", st.session_state.chunk_count)
-    col3.metric("📁 Files Uploaded", len(df))
-    # ---- Bar Chart ----
-    fig1 = px.bar(
-        df,
-        x="File",
-        y="Pages",
-        color="Type",
-        title="Pages per File"
-    )
-    st.plotly_chart(fig1, use_container_width=True)
-    # ---- Pie Chart ----
-    fig2 = px.pie(
-        df,
-        names="Type",
-        title="File Type Distribution"
-    )
-    st.plotly_chart(fig2, use_container_width=True)
-    # ---- Line Chart ----
-    growth_df = pd.DataFrame({
-        "Stage": ["Documents", "Chunks"],
-        "Count": [st.session_state.doc_count, st.session_state.chunk_count]
-    })
-    fig3 = px.line(
-        growth_df,
-        x="Stage",
-        y="Count",
-        markers=True,
-        title="Processing Growth"
-    )
-    st.plotly_chart(fig3, use_container_width=True)
 # -------------------------------
-# CHATBOT
 # -------------------------------
-st.subheader("🤖 Chat with Documents")
-query = st.text_input("Ask your question...")
 if query and st.session_state.qa:
-    with st.spinner("Thinking..."):
-        result = st.session_state.qa.invoke({"query": query})
-        answer = result["result"]
-        # Save history
-        st.session_state.history.append((query, answer))
 # -------------------------------
-# CHAT HISTORY
 # -------------------------------
-if st.session_state.history:
-    st.subheader("💬 Chat History")
-    for q, a in reversed(st.session_state.history):
-        st.markdown(f"**🧑 Question:** {q}")
-        st.markdown(f"**🤖 Answer:** {a}")
-        st.markdown("---")

 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain.chains import RetrievalQA
+# Local LLM
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+from langchain_community.llms import HuggingFacePipeline
 # Charts
 import plotly.express as px
 # -------------------------------
+# CONFIG
 # -------------------------------
+st.set_page_config(page_title="Offline GPT RAG", layout="wide")
+st.title("🤖 Offline ChatGPT-like RAG + 📊 Dashboard")
 # -------------------------------
+# CACHE MODEL (IMPORTANT ⚡)
 # -------------------------------
 @st.cache_resource
 def load_llm():
+    model_name = "google/flan-t5-base"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
     pipe = pipeline(
         "text2text-generation",
+        model=model,
+        tokenizer=tokenizer,
         max_length=512
     )
+    return HuggingFacePipeline(pipeline=pipe)
 # -------------------------------
+# LOAD DOCS
 # -------------------------------
+def load_docs(files):
     docs = []
     stats = []
+    os.makedirs("temp", exist_ok=True)
     for file in files:
         path = os.path.join("temp", file.name)
         with open(path, "wb") as f:
             f.write(file.getbuffer())
     return docs, pd.DataFrame(stats)
 # -------------------------------
+# SPLIT
 # -------------------------------
 def split_docs(docs):
     splitter = RecursiveCharacterTextSplitter(
+        chunk_size=400,
         chunk_overlap=50
     )
     return splitter.split_documents(docs)
 # -------------------------------
 # VECTOR STORE
 # -------------------------------
+@st.cache_resource
+def load_embeddings():
+    return HuggingFaceEmbeddings(
+        model_name="sentence-transformers/all-MiniLM-L6-v2"
+    )
 def create_vectorstore(chunks):
+    return FAISS.from_documents(chunks, load_embeddings())
 # -------------------------------
+# QA CHAIN (BETTER PROMPT)
 # -------------------------------
 def build_qa(vs):
+    """Build a RetrievalQA chain over the vector store ``vs``.
+
+    Uses the LLM returned by ``load_llm()``, retrieves the top 3 chunks
+    from ``vs`` for each query, and fills them into a prompt that tells
+    the model to answer only from the supplied context.
+    """
+    # Local import keeps this block self-contained with respect to the
+    # file's import header.
+    from langchain.prompts import PromptTemplate
+
     llm = load_llm()
+    prompt_template = """
+    You are an intelligent assistant.
+    Answer ONLY from the provided context.
+    If the answer is not in the context, say "Not found in document".
+    Context:
+    {context}
+    Question:
+    {question}
+    Answer:
+    """
+    # The chain needs a prompt-template object, not a bare string: a raw
+    # str passed as "prompt" is rejected when the chain is constructed.
+    prompt = PromptTemplate(
+        template=prompt_template,
+        input_variables=["context", "question"],
+    )
     return RetrievalQA.from_chain_type(
         llm=llm,
+        retriever=vs.as_retriever(search_kwargs={"k": 3}),
+        chain_type_kwargs={"prompt": prompt}
     )
 # -------------------------------
+# SESSION
 # -------------------------------
 if "qa" not in st.session_state:
     st.session_state.qa = None
     st.session_state.history = []
 # -------------------------------
+# UPLOAD
+# -------------------------------
+files = st.file_uploader("Upload PDF/TXT", accept_multiple_files=True)
+# -------------------------------
+# PROCESS
 # -------------------------------
 if files and st.session_state.qa is None:
+    with st.spinner("Processing..."):
+        docs, df = load_docs(files)
         chunks = split_docs(docs)
         vs = create_vectorstore(chunks)
         qa = build_qa(vs)
         st.session_state.qa = qa
         st.session_state.df = df
         st.session_state.doc_count = len(docs)
+        st.session_state.chunk_count = len(chunks)
+    st.success("✅ Ready!")
 # -------------------------------
 # DASHBOARD
 # -------------------------------
 if st.session_state.qa:
+    st.subheader("📊 Analytics")
     df = st.session_state.df
+    st.metric("Docs", st.session_state.doc_count)
+    st.metric("Chunks", st.session_state.chunk_count)
+    st.plotly_chart(px.bar(df, x="File", y="Pages", color="Type"))
+    st.plotly_chart(px.pie(df, names="Type"))
 # -------------------------------
+# CHAT
 # -------------------------------
+query = st.text_input("Ask your question")
 if query and st.session_state.qa:
+    result = st.session_state.qa.invoke({"query": query})
+    answer = result["result"]
+    st.session_state.history.append((query, answer))
 # -------------------------------
+# HISTORY
 # -------------------------------
+for q, a in reversed(st.session_state.history):
+    st.markdown(f"**Q:** {q}")
+    st.markdown(f"**A:** {a}")
+    st.markdown("---")