Spaces:

Muthuraja18
/

Chatbot

Sleeping

App Files Community

Muthuraja18 committed on 29 days ago

Commit

d13bf3b

verified ·

1 Parent(s): 9a2b77a

Update app.py

Browse files

Files changed (1) hide show

app.py +155 -63

app.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import streamlit as st
-import tempfile
 import os
-# LangChain imports (new structure)
 from langchain_community.document_loaders import PyPDFLoader, TextLoader
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import HuggingFaceEmbeddings
@@ -10,107 +10,199 @@ from langchain_community.vectorstores import FAISS
 from langchain_community.llms import HuggingFacePipeline
 from langchain.chains import RetrievalQA
 from transformers import pipeline
 # -------------------------------
-# Load Documents (FIXED temp file handling)
 # -------------------------------
-def load_documents(uploaded_files):
-    documents = []
-    for file in uploaded_files:
-        suffix = file.name.split(".")[-1]
-        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{suffix}") as tmp:
-            tmp.write(file.getbuffer())
-            tmp_path = tmp.name
-        if suffix == "pdf":
-            loader = PyPDFLoader(tmp_path)
-        else:
-            loader = TextLoader(tmp_path)
-        documents.extend(loader.load())
-        os.remove(tmp_path)  # cleanup
-    return documents
 # -------------------------------
-# Split Documents
 # -------------------------------
-def split_documents(documents):
     splitter = RecursiveCharacterTextSplitter(
         chunk_size=500,
         chunk_overlap=50
     )
-    return splitter.split_documents(documents)
 # -------------------------------
-# Create Vector Store
 # -------------------------------
 def create_vectorstore(chunks):
-    embeddings = HuggingFaceEmbeddings(
-        model_name="sentence-transformers/all-MiniLM-L6-v2"
-    )
     return FAISS.from_documents(chunks, embeddings)
 # -------------------------------
-# Load Local LLM (STABLE VERSION)
 # -------------------------------
-@st.cache_resource
-def load_llm():
-    pipe = pipeline(
-        task="text2text-generation",
-        model="google/flan-t5-base",
-        max_length=512,
-        do_sample=False
     )
-    return HuggingFacePipeline(pipeline=pipe)
 # -------------------------------
-# Build QA Chain
 # -------------------------------
-def build_qa(vectorstore):
-    llm = load_llm()
-    retriever = vectorstore.as_retriever()
-    qa = RetrievalQA.from_chain_type(
-        llm=llm,
-        retriever=retriever
     )
-    return qa
 # -------------------------------
-# Streamlit UI
 # -------------------------------
-st.set_page_config(page_title="RAG Chatbot", layout="wide")
-st.title("📄 Chat with Your Documents (RAG)")
-uploaded_files = st.file_uploader(
-    "Upload PDF or TXT files",
-    accept_multiple_files=True
-)
-if uploaded_files:
-    with st.spinner("Processing documents..."):
-        docs = load_documents(uploaded_files)
-        chunks = split_documents(docs)
-        vectorstore = create_vectorstore(chunks)
-        qa_chain = build_qa(vectorstore)
-    st.success("✅ Documents ready!")
-    query = st.text_input("Ask a question from your documents")
-    if query:
-        with st.spinner("Generating answer..."):
-            result = qa_chain.invoke({"query": query})
-            st.write("### Answer:")
-            st.write(result["result"])

 import streamlit as st
+import pandas as pd
 import os
+# LangChain
 from langchain_community.document_loaders import PyPDFLoader, TextLoader
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.llms import HuggingFacePipeline
 from langchain.chains import RetrievalQA
+# Transformers
 from transformers import pipeline
+# Charts
+import plotly.express as px
 # -------------------------------
+# PAGE CONFIG
 # -------------------------------
+st.set_page_config(page_title="RAG + Analytics", layout="wide")
+st.title("📄 RAG Chatbot + 📊 Analytics Dashboard")
+# -------------------------------
+# CACHE (VERY IMPORTANT ⚡)
+# -------------------------------
+@st.cache_resource
+def load_llm():
+    pipe = pipeline(
+        "text2text-generation",
+        model="google/flan-t5-base",
+        max_length=512
+    )
+    return HuggingFacePipeline(pipeline=pipe)
+@st.cache_resource
+def load_embeddings():
+    return HuggingFaceEmbeddings(
+        model_name="sentence-transformers/all-MiniLM-L6-v2"
+    )
+# -------------------------------
+# LOAD DOCUMENTS
+# -------------------------------
+def load_documents(files):
+    docs = []
+    stats = []
+    for file in files:
+        path = os.path.join("temp", file.name)
+        os.makedirs("temp", exist_ok=True)
+        with open(path, "wb") as f:
+            f.write(file.getbuffer())
+        if file.name.endswith(".pdf"):
+            loader = PyPDFLoader(path)
+            ftype = "PDF"
+        else:
+            loader = TextLoader(path)
+            ftype = "TXT"
+        loaded = loader.load()
+        docs.extend(loaded)
+        stats.append({
+            "File": file.name,
+            "Type": ftype,
+            "Pages": len(loaded)
+        })
+    return docs, pd.DataFrame(stats)
 # -------------------------------
+# SPLIT DOCUMENTS
 # -------------------------------
+def split_docs(docs):
     splitter = RecursiveCharacterTextSplitter(
         chunk_size=500,
         chunk_overlap=50
     )
+    return splitter.split_documents(docs)
 # -------------------------------
+# VECTOR STORE
 # -------------------------------
 def create_vectorstore(chunks):
+    embeddings = load_embeddings()
     return FAISS.from_documents(chunks, embeddings)
 # -------------------------------
+# QA CHAIN
 # -------------------------------
+def build_qa(vs):
+    llm = load_llm()
+    return RetrievalQA.from_chain_type(
+        llm=llm,
+        retriever=vs.as_retriever()
     )
+# -------------------------------
+# FILE UPLOAD
+# -------------------------------
+files = st.file_uploader(
+    "Upload PDF / TXT files",
+    accept_multiple_files=True
+)
 # -------------------------------
+# SESSION STATE
 # -------------------------------
+if "qa" not in st.session_state:
+    st.session_state.qa = None
+if "history" not in st.session_state:
+    st.session_state.history = []
+# -------------------------------
+# PROCESS FILES
+# -------------------------------
+if files and st.session_state.qa is None:
+    with st.spinner("Processing documents..."):
+        docs, df = load_documents(files)
+        chunks = split_docs(docs)
+        vs = create_vectorstore(chunks)
+        qa = build_qa(vs)
+        st.session_state.qa = qa
+        st.session_state.df = df
+        st.session_state.chunk_count = len(chunks)
+        st.session_state.doc_count = len(docs)
+    st.success("✅ Documents processed!")
+# -------------------------------
+# DASHBOARD
+# -------------------------------
+if st.session_state.qa:
+    st.subheader("📊 Analytics Dashboard")
+    df = st.session_state.df
+    col1, col2, col3 = st.columns(3)
+    col1.metric("📄 Total Documents", st.session_state.doc_count)
+    col2.metric("🧩 Total Chunks", st.session_state.chunk_count)
+    col3.metric("📁 Files Uploaded", len(df))
+    # ---- Bar Chart ----
+    fig1 = px.bar(
+        df,
+        x="File",
+        y="Pages",
+        color="Type",
+        title="Pages per File"
     )
+    st.plotly_chart(fig1, use_container_width=True)
+    # ---- Pie Chart ----
+    fig2 = px.pie(
+        df,
+        names="Type",
+        title="File Type Distribution"
+    )
+    st.plotly_chart(fig2, use_container_width=True)
+    # ---- Line Chart ----
+    growth_df = pd.DataFrame({
+        "Stage": ["Documents", "Chunks"],
+        "Count": [st.session_state.doc_count, st.session_state.chunk_count]
+    })
+    fig3 = px.line(
+        growth_df,
+        x="Stage",
+        y="Count",
+        markers=True,
+        title="Processing Growth"
+    )
+    st.plotly_chart(fig3, use_container_width=True)
 # -------------------------------
+# CHATBOT
 # -------------------------------
+st.subheader("🤖 Chat with Documents")
+query = st.text_input("Ask your question...")
+if query and st.session_state.qa:
+    with st.spinner("Thinking..."):
+        result = st.session_state.qa.invoke({"query": query})
+        answer = result["result"]
+        # Save history
+        st.session_state.history.append((query, answer))
+# -------------------------------
+# CHAT HISTORY
+# -------------------------------
+if st.session_state.history:
+    st.subheader("💬 Chat History")
+    for q, a in reversed(st.session_state.history):
+        st.markdown(f"**🧑 Question:** {q}")
+        st.markdown(f"**🤖 Answer:** {a}")
+        st.markdown("---")