Spaces:

rahideer
/

dataset

Sleeping

App Files Files Community

rahideer committed on Apr 18, 2025

Commit

d28742c

verified ·

1 Parent(s): dc5042a

Update app.py

Browse files

Files changed (1) hide show

app.py +90 -114

app.py CHANGED Viewed

@@ -1,122 +1,98 @@
 import streamlit as st
-from PyPDF2 import PdfReader
 from sentence_transformers import SentenceTransformer
-from transformers import pipeline
 import faiss
 import numpy as np
-# ---------- Custom CSS for UI ----------
-def apply_custom_style():
-    """Inject the app's stylesheet through st.markdown with unsafe_allow_html=True.
-
-    The CSS defines the .title, .subtitle, .question-box and .example classes
-    that the HTML snippets rendered later in this script refer to.
-    """
-    st.markdown("""
-        <style>
-        html, body, [class*="css"]  {
-            font-family: 'Segoe UI', sans-serif;
-            background-color: #f0f4ff;
-        }
-        .title {
-            background: linear-gradient(to right, #4a90e2, #00c6ff);
-            -webkit-background-clip: text;
-            -webkit-text-fill-color: transparent;
-            font-size: 2.5em;
-            font-weight: bold;
-        }
-        .subtitle {
-            color: #444;
-            font-size: 1.2em;
-            margin-bottom: 1rem;
-        }
-        .question-box {
-            background-color: #fff;
-            padding: 1rem;
-            border-radius: 10px;
-            box-shadow: 0px 2px 10px rgba(0,0,0,0.1);
-            margin-bottom: 1rem;
-        }
-        .example {
-            color: #444;
-            background: #e9f0ff;
-            padding: 0.5rem;
-            border-radius: 8px;
-            margin: 3px 0;
-            cursor: pointer;
-        }
-        </style>
-    """, unsafe_allow_html=True)
-# ---------- PDF Reading ----------
-def load_pdf_text(pdf_path):
-    """Return the extracted text of every page of the PDF as one string.
-
-    Pages whose extract_text() result is falsy are skipped, and page texts
-    are concatenated with no separator between them.
-    """
-    reader = PdfReader(pdf_path)
-    text = ''
-    for page in reader.pages:
-        # NOTE(review): extract_text() is invoked twice per page (test, then use).
-        if page.extract_text():
-            text += page.extract_text()
-    return text
-# ---------- Chunking ----------
-def chunk_text(text, max_len=500):
-    """Greedily pack '. '-separated sentences into chunks of about max_len characters.
-
-    Every sentence gets '. ' appended again before being added to a chunk, and
-    each finished chunk is stripped of surrounding whitespace.
-    """
-    sentences = text.split('. ')
-    chunks, chunk = [], ''
-    for sentence in sentences:
-        # The size check ignores the 2-character '. ' suffix added below, so a
-        # chunk can end up slightly longer than max_len.
-        if len(chunk) + len(sentence) <= max_len:
-            chunk += sentence + '. '
-        else:
-            # NOTE(review): if the very first sentence is longer than max_len,
-            # `chunk` is still '' here and an empty string is appended.
-            chunks.append(chunk.strip())
-            chunk = sentence + '. '
-    if chunk:
-        chunks.append(chunk.strip())
-    return chunks
-# ---------- Embedding ----------
-@st.cache_resource
-def embed_chunks(chunks):
-    """Encode `chunks` with SentenceTransformer('all-MiniLM-L6-v2').
-
-    Returns a tuple (embeddings, model) so the caller can reuse the same
-    model to encode queries.
-    """
-    model = SentenceTransformer('all-MiniLM-L6-v2')
-    embeddings = model.encode(chunks)
-    return embeddings, model
-# ---------- RAG-Based QA ----------
-def answer_query(query, embeddings, chunks, model, qa_pipeline):
-    """Answer `query` from the 5 chunks nearest to it and return the answer text.
-
-    The query is encoded with `model`, the nearest chunks are joined with blank
-    lines into one context string, and `qa_pipeline` is run on that context;
-    its result['answer'] is returned.
-    """
-    query_embedding = model.encode([query])
-    # NOTE(review): a new IndexFlatL2 is built from all embeddings on every call.
     index = faiss.IndexFlatL2(embeddings.shape[1])
     index.add(np.array(embeddings))
-    _, I = index.search(np.array(query_embedding), k=5)  # retrieve top 5 chunks
-    context = "\n\n".join([chunks[i] for i in I[0]])     # longer, better context
-    result = qa_pipeline(question=query, context=context)
-    return result['answer']
-# ---------- App Layout ----------
-# Everything from here on runs at script level, top to bottom.
-apply_custom_style()
-st.markdown('<div class="title">🤖 RAG PDF Q&A App</div>', unsafe_allow_html=True)
-st.markdown('<div class="subtitle">Ask questions about a machine learning PDF. Powered by Transformers and FAISS!</div>', unsafe_allow_html=True)
-# ---------- Load PDF ----------
-# The PDF path is hard-coded and resolved relative to the working directory.
-pdf_path = "ml_dataset_25_pages.pdf"
-raw_text = load_pdf_text(pdf_path)
-chunks = chunk_text(raw_text)
-embeddings, embedder = embed_chunks(chunks)
-# ---------- QA Pipeline ----------
-# NOTE(review): built at script level, unlike embed_chunks which is wrapped in
-# st.cache_resource — confirm whether this is re-created on each rerun.
-qa = pipeline(
-    "question-answering",
-    model="deepset/roberta-base-squad2",
-    tokenizer="deepset/roberta-base-squad2"
-)
-# ---------- Sample Questions ----------
-# NOTE(review): the opening <div> below and its closing </div> are emitted by
-# separate st.markdown calls — confirm they really wrap the examples when rendered.
-st.markdown('<div class="question-box"><strong>💡 Sample Questions:</strong>', unsafe_allow_html=True)
-sample_questions = [
-    "What is supervised learning?",
-    "Explain the difference between regression and classification.",
-    "What are the applications of machine learning?",
-    "How does decision tree algorithm work?",
-    "What is overfitting in machine learning?"
-]
-for q in sample_questions:
-    st.markdown(f'<div class="example">{q}</div>', unsafe_allow_html=True)
-st.markdown('</div>', unsafe_allow_html=True)
-# ---------- User Query ----------
-# An empty text input is falsy, so nothing is answered until the user types.
-query = st.text_input("🔎 Ask your question here:")
-if query:
-    with st.spinner("Thinking..."):
-        answer = answer_query(query, embeddings, chunks, embedder, qa)
-    st.success(f"🧠 Answer: {answer}")

 import streamlit as st
+import PyPDF2
 from sentence_transformers import SentenceTransformer
 import faiss
 import numpy as np
+from transformers import pipeline
+# Page-level configuration: browser tab title and wide layout.
+st.set_page_config(page_title="📘 PDF QA RAG App", layout="wide")
+# Custom styles
+# NOTE(review): "sample-dropdown" is only used as the `key` of the selectbox
+# further down — confirm the `.sample-dropdown` selector matches any element.
+st.markdown("""
+    <style>
+    .main {background-color: #f7faff;}
+    .block-container {padding-top: 2rem;}
+    h1 {color: #4051b5;}
+    .stTextInput>div>div>input {border: 2px solid #d0d7ff;}
+    .stButton button {background-color: #4051b5; color: white; border-radius: 6px;}
+    .stSidebar {background-color: #eaf0ff;}
+    .sample-dropdown label {font-weight: bold;}
+    </style>
+""", unsafe_allow_html=True)
+# Page header.
+st.title("📘 Ask Me Anything From Your PDF")
+st.caption("Built using RAG (Retrieval-Augmented Generation) ✨")
+# Sidebar uploader; `uploaded_file` is falsy until the user provides a PDF.
+st.sidebar.header("📁 Upload PDF")
+uploaded_file = st.sidebar.file_uploader("Upload a PDF file", type=["pdf"])
+# Canned questions offered in the "Sample Questions" selectbox.
+default_questions = [
+    "What is machine learning?",
+    "Explain generalization in ML.",
+    "What are different types of ML?",
+    "How is ML used in computer vision?",
+    "Describe the importance of training data."
+]
+@st.cache_data
+def load_pdf(file):
+    """Read a PDF and return the extracted text of each page, in page order.
+
+    Args:
+        file: Anything PyPDF2.PdfReader accepts; the script passes the
+            sidebar upload.
+
+    Returns:
+        A list with one extract_text() result per page.
+    """
+    pdf = PyPDF2.PdfReader(file)
+    page_texts = []
+    for pdf_page in pdf.pages:
+        page_texts.append(pdf_page.extract_text())
+    return page_texts
+def chunk_text(pages, max_len=1000):
+    """Split the combined page text into chunks of at most `max_len` words.
+
+    Pages are joined with a single space and split on whitespace, so chunk
+    boundaries ignore page breaks. Note that `max_len` counts words, not
+    characters.
+
+    Args:
+        pages: Iterable of page text strings.
+        max_len: Maximum number of whitespace-separated words per chunk.
+
+    Returns:
+        A list of chunk strings; empty when the pages contain no words.
+    """
+    tokens = " ".join(pages).split()
+    pieces = []
+    for start in range(0, len(tokens), max_len):
+        pieces.append(' '.join(tokens[start:start + max_len]))
+    return pieces
+def create_faiss_index(chunks, model):
+    """Embed `chunks` and load the vectors into a flat L2 FAISS index.
+
+    Args:
+        chunks: Non-empty list of text chunks to embed.
+        model: Object exposing encode(list_of_str) that returns a 2-D array
+            of embeddings (the script passes a SentenceTransformer).
+
+    Returns:
+        A tuple (index, embeddings).
+
+    Raises:
+        ValueError: If `chunks` is empty. Without this guard the failure
+            surfaces as an opaque IndexError on `embeddings.shape[1]`,
+            because there is no embedding dimension to size the index with.
+    """
+    if not chunks:
+        raise ValueError("Cannot build a FAISS index from an empty list of chunks.")
+    embeddings = model.encode(chunks)
     index = faiss.IndexFlatL2(embeddings.shape[1])
     index.add(np.array(embeddings))
+    return index, embeddings
+def retrieve_context(question, chunks, index, model, k=6):
+    """Return the chunks nearest to `question`, joined by blank lines.
+
+    Args:
+        question: The user's question text.
+        chunks: The chunk strings, in the order their embeddings were indexed.
+        index: Object exposing search(query_vectors, k) -> (distances, ids),
+            such as the FAISS index built from `chunks`.
+        model: Object exposing encode(list_of_str) for the query embedding.
+        k: Maximum number of chunks to retrieve.
+
+    Returns:
+        The retrieved chunks joined with "\n\n", nearest first; an empty
+        string when there are no chunks.
+    """
+    if not chunks:
+        return ""
+    # FAISS pads the id array with -1 when fewer than k vectors are indexed.
+    # Python would read chunks[-1] as "the last chunk" and repeat it in the
+    # context, so never ask for more neighbours than there are chunks.
+    k = min(k, len(chunks))
+    q_embedding = model.encode([question])
+    _, I = index.search(np.array(q_embedding), k)
+    # Drop any remaining out-of-range ids (e.g. index smaller than `chunks`).
+    return "\n\n".join(chunks[i] for i in I[0] if 0 <= i < len(chunks))
+@st.cache_resource
+def _load_models():
+    """Load the sentence embedder and the QA pipeline once and reuse them.
+
+    Streamlit re-executes the script on every widget interaction; caching
+    avoids re-instantiating both models on each rerun.
+    """
+    return (
+        SentenceTransformer('all-MiniLM-L6-v2'),
+        pipeline("question-answering", model="deepset/roberta-base-squad2"),
+    )
+
+
+@st.cache_resource
+def _build_index(chunks, _model):
+    """Build the FAISS index for `chunks`, cached per distinct chunk list.
+
+    The leading underscore on `_model` tells Streamlit not to hash it.
+    """
+    index, _ = create_faiss_index(chunks, _model)
+    return index
+
+
+# Main flow: index the uploaded PDF, answer typed or sample questions, and
+# preview the first pages. Without an upload, only a hint is shown.
+if uploaded_file:
+    st.success("✅ PDF uploaded successfully!")
+    pages = load_pdf(uploaded_file)
+    chunks = chunk_text(pages)
+    if not chunks:
+        # E.g. a scanned PDF: there is nothing to embed or search.
+        st.warning("No extractable text was found in this PDF, so questions cannot be answered.")
+        st.stop()
+    model, qa_pipeline = _load_models()
+    index = _build_index(chunks, model)
+    st.subheader("💬 Ask a question")
+    col1, col2 = st.columns([3, 1])
+    with col1:
+        question = st.text_input("Enter your question here...", placeholder="e.g. What is deep learning?")
+    with col2:
+        if st.button("Ask"):
+            if not question.strip():
+                # Do not send an empty question to the QA pipeline.
+                st.warning("Please enter a question first.")
+            else:
+                with st.spinner("🧠 Thinking..."):
+                    context = retrieve_context(question, chunks, index, model)
+                    result = qa_pipeline(question=question, context=context)
+                    with st.expander("📖 Answer", expanded=True):
+                        st.markdown(result['answer'])
+    st.divider()
+    st.subheader("✨ Sample Questions")
+    selected_q = st.selectbox("Pick one to try:", default_questions, key="sample-dropdown")
+    if st.button("Try Selected Question"):
+        with st.spinner("⏳ Searching..."):
+            context = retrieve_context(selected_q, chunks, index, model)
+            result = qa_pipeline(question=selected_q, context=context)
+            with st.expander(f"💡 Answer to: '{selected_q}'", expanded=True):
+                st.markdown(result['answer'])
+    st.divider()
+    st.subheader("📄 Preview PDF Pages")
+    # Show at most the first three pages, each truncated to 800 characters.
+    for i, page in enumerate(pages[:3]):
+        st.markdown(f"**Page {i+1}**")
+        st.code(page[:800] + "..." if len(page) > 800 else page)
+else:
+    st.info("Upload a PDF from the sidebar to begin.")