mrciomnl commited on
Commit
929fd26
Β·
1 Parent(s): db47027

New Branch

Browse files
Files changed (6) hide show
  1. README.md +63 -0
  2. analyzer.py +89 -0
  3. app.py +17 -0
  4. pages/1 - Introduction.py +38 -0
  5. pages/2 - Methodology.py +82 -0
  6. pages/3 - Demo.py +96 -0
README.md CHANGED
@@ -11,4 +11,67 @@ license: mit
11
  short_description: A prototype for final project in NLP
12
  ---
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
11
  short_description: A prototype for final project in NLP
12
  ---
13
 
14
+ # πŸ“„ DocuBot: PDF Analyzer
15
+
16
+ A lightweight Streamlit app that lets you analyze academic PDFs or lecture slides — no LLMs needed!
17
+
18
+ ---
19
+
20
+ ## πŸš€ What This App Does
21
+ - 🧠 Named Entity Recognition (NER): Extracts people, places, and organizations.
22
+ - πŸ” Document Search: Answers your custom questions using TF-IDF relevance.
23
+ - πŸ“ Extractive Summarization: Highlights the most important sentences using TextRank.
24
+ - πŸ“₯ Summary Download: Export your summary as .txt or .pdf.
25
+ - πŸŒ— Light/Dark UI toggle (Streamlit theme).
26
+
27
+ ---
28
+
29
+ ## πŸ§ͺ How It Works
30
+ - Text is extracted using pdfplumber.
31
+ - Entities are recognized using spaCy's transformer model (en_core_web_trf).
32
+ - Document search uses TF-IDF with cosine similarity.
33
+ - Summarization is done via sumy's TextRank.
34
+ - Everything runs locally in-browser via Streamlit.
35
+
36
+ ---
37
+
38
+ ## πŸ“‚ File Types Supported
39
+ - Standard PDFs (.pdf)
40
+ - Lecture slides saved as PDF (.pptx.pdf)
41
+
42
+ ---
43
+
44
+ ## πŸ§‘β€πŸ’» How to Use (on Hugging Face Spaces)
45
+ 1. Navigate to the "πŸ“‚ Demo" tab.
46
+ 2. Upload a PDF or use the provided sample.
47
+ 3. Optionally ask a question like "What is the main topic?"
48
+ 4. View the entities, relevant chunks, and summary.
49
+ 5. Download results and rate your experience.
50
+
51
+ ---
52
+
53
+ ## πŸ›  Dependencies
54
+ Add these to requirements.txt if you're running locally:
55
+
56
+ ```
57
+ streamlit
58
+ pdfplumber
59
+ spacy
60
+ en_core_web_trf
61
+ scikit-learn
62
+ sumy
63
+ fpdf
64
+ ```
65
+
66
+ ---
67
+
68
+ ## πŸ™Œ Credits
69
+ Built with πŸ’™ using open-source NLP libraries.
70
+ Project created for learning and experimentation purposes.
71
+
72
+ ---
73
+
74
+ Have fun analyzing! πŸ€–
75
+
76
+
77
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
analyzer.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
2
+ import spacy
3
+ from sklearn.feature_extraction.text import TfidfVectorizer
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+ from sumy.parsers.plaintext import PlaintextParser
6
+ from sumy.nlp.tokenizers import Tokenizer
7
+ from sumy.summarizers.text_rank import TextRankSummarizer
8
+ import re
9
+ import numpy as np
10
+
11
+ # Load spaCy transformer model
12
+ nlp = spacy.load("en_core_web_trf")
13
+
14
# === Text Cleaning ===
def clean_text(text):
    """Normalize raw PDF text.

    Removes bullet glyphs, replaces any remaining non-ASCII characters with
    spaces, collapses runs of whitespace, and trims the ends.
    """
    without_bullets = re.sub(r"β€’", "", text)
    ascii_only = re.sub(r"[^\x00-\x7F]+", " ", without_bullets)
    collapsed = re.sub(r"\s+", " ", ascii_only)
    return collapsed.strip()
20
+
21
# === PDF Extraction ===
def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF and return it cleaned.

    Pages from which pdfplumber cannot extract text are skipped.
    """
    pages = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            content = page.extract_text()
            if content:
                pages.append(content + "\n")
    return clean_text("".join(pages))
30
+
31
# === Named Entity Recognition ===
def perform_ner(text):
    """Group the document's named entities into people, places, and organizations.

    Uses the module-level spaCy pipeline; order of mentions is preserved
    within each category.
    """
    buckets = {"people": [], "places": [], "organizations": []}
    label_to_bucket = {
        "PERSON": "people",
        "GPE": "places",
        "LOC": "places",
        "ORG": "organizations",
    }
    for ent in nlp(text).ents:
        bucket = label_to_bucket.get(ent.label_)
        if bucket is not None:
            buckets[bucket].append(ent.text)
    return buckets
39
+
40
# === TF-IDF Relevance ===
def get_relevant_chunks(query, text, num_chunks=5):
    """Return up to *num_chunks* sentences of *text* most similar to *query*.

    Sentences of 10 characters or fewer are discarded before scoring.
    Similarity is cosine similarity over 1-2 gram TF-IDF vectors.

    Fix: previously an empty/very short document produced an empty sentence
    list and TfidfVectorizer.fit_transform raised ValueError; now returns [].
    """
    sentences = [sent.text.strip() for sent in nlp(text).sents if len(sent.text.strip()) > 10]
    if not sentences:
        return []
    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
    # Vectorize the query together with the sentences so they share a vocabulary;
    # the query occupies the last row of the matrix.
    tfidf_matrix = vectorizer.fit_transform(sentences + [query])
    cosine_sim = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])
    # Highest-scoring sentence indices, best first (fewer than num_chunks is fine).
    indices = cosine_sim.argsort()[0, -num_chunks:][::-1]
    return [sentences[i] for i in indices]
48
+
49
# === Summary Cleanup ===
def deduplicate(sentences):
    """Strip each sentence and drop later duplicates, preserving first-seen order."""
    unique = []
    seen = set()
    for sentence in sentences:
        cleaned = sentence.strip()
        if cleaned in seen:
            continue
        seen.add(cleaned)
        unique.append(cleaned)
    return unique
59
+
60
def is_too_technical(s):
    """Flag equation-heavy (more than three '=') or very long (>300 chars) sentences."""
    if len(s) > 300:
        return True
    return s.count("=") > 3
62
+
63
def is_tabular(s):
    """Heuristic for table-like rows: starts with a digit, or has more than six number runs."""
    starts_with_digit = re.match(r'^\d', s) is not None
    numeric_runs = re.findall(r'\d+', s)
    return starts_with_digit or len(numeric_runs) > 6
65
+
66
def shorten(s, limit=250):
    """Return *s* unchanged if within *limit*; otherwise cut at the last word
    boundary inside the limit and append an ellipsis."""
    if len(s) <= limit:
        return s
    truncated = s[:limit].rsplit(" ", 1)[0]
    return truncated + "..."
68
+
69
def filter_summary(summary):
    """Post-process summary sentences: drop duplicates, technical and
    table-like lines, and clip the survivors to a readable length."""
    kept = []
    for sentence in deduplicate(summary):
        if is_too_technical(sentence) or is_tabular(sentence):
            continue
        kept.append(shorten(sentence))
    return kept
71
+
72
# === TextRank Summarizer ===
def summarize_text(text, num_sentences=10):
    """Produce an extractive summary of *text* using sumy's TextRank,
    filtered through filter_summary()."""
    document = PlaintextParser.from_string(text, Tokenizer("english")).document
    ranked = TextRankSummarizer()(document, num_sentences)
    return filter_summary([str(sentence) for sentence in ranked])
78
+
79
# === Top-Level Function ===
def analyze_pdf(file, query):
    """Run the full pipeline on an open PDF file object.

    Returns a dict with keys "entities", "relevant_chunks", and "summary".
    """
    text = extract_text_from_pdf(file)
    return {
        "entities": perform_ner(text),
        "relevant_chunks": get_relevant_chunks(query, text),
        "summary": summarize_text(text),
    }
app.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# == app.py ==
# Landing page: configures the app shell and points users at the sidebar pages.
import streamlit as st

st.set_page_config(page_title="DocuBot", layout="wide", initial_sidebar_state="expanded")
st.title("πŸ€– DocuBot")
# Fix: welcome line previously read "**DocuBott**!." (misspelled app name and
# doubled punctuation).
st.markdown("""
Welcome to the **DocuBot**! Navigate using the sidebar.
- Learn about the app on the **Introduction** page
- Understand its design on the **Methodology** page
- Try it out on the **Demo** page

✨ Features:
- PDF upload and document analysis (NER, summarization)
- Custom user query support for QA
- Downloadable summary report
- Light/dark mode compatible
""")
pages/1 - Introduction.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st

st.set_page_config(page_title="πŸ“˜ Introduction", layout="wide")

st.title("πŸ“˜ Welcome to DocuBot: PDF Analyzer")

# Static overview copy for the Introduction page; kept in one constant so the
# layout code stays minimal.
INTRO_MD = """
DocuBot is a lightweight, efficient, and interpretable PDF document analysis tool built for academic and technical materials.

### 🎯 Objective
To build a reliable system that:
- Extracts text from PDF and PPTX-based lecture slides.
- Applies Named Entity Recognition (NER) to highlight important people, places, and organizations.
- Performs document-level question answering using TF-IDF.
- Summarizes content extractively using TextRank.

### πŸ” Use Case
Whether you're studying, reviewing a report, or evaluating a research paper, DocuBot helps you:
- Quickly understand the core topics.
- Search through document segments using natural questions.
- Get concise summaries without reading everything manually.

### πŸ’‘ Why This Matters
Many educational PDFs (especially slides) are dense with fragmented bullets and equations. DocuBot is designed to cleanly parse and analyze these, helping users:
- Save time
- Focus on relevant content
- Extract technical insights

### 🌐 Features
- Upload .pdf and .pptx.pdf files
- Named Entity Recognition (NER)
- Relevance-based document QA
- Extractive summarization (no LLMs!)
- Optional light/dark UI themes
- Downloadable summary in .txt and .pdf

Jump to the "Demo" tab to try it yourself! πŸš€
"""

st.markdown(INTRO_MD)
pages/2 - Methodology.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st

st.set_page_config(page_title="πŸ§ͺ Methodology", layout="wide")

st.title("πŸ§ͺ Methodology")

# Static explanatory copy for the Methodology page, hoisted into a constant.
METHODOLOGY_MD = """
This section outlines the techniques and processes that power DocuBot. The system uses classical NLP techniques (not large language models) for transparency, efficiency, and reproducibility.

---

## 🧠 Named Entity Recognition (NER)

We use spaCy's en_core_web_trf transformer model to identify:

- πŸ‘€ People (e.g., scientists, authors, historical figures)
- 🌍 Locations (cities, countries, geographic entities)
- 🏒 Organizations (institutions, universities, companies)

NER helps highlight key actors and topics within the document.

▢️ Example:
"Bayes Theorem was developed by Thomas Bayes" ⟢ PERSON: Thomas Bayes

---

## πŸ”Ž Document Search (QA by TF-IDF)

We divide the document into sentences, then compute TF-IDF scores:

1. πŸ“œ Tokenize the document into sentences
2. πŸ“ˆ Compute TF-IDF for each sentence and the query
3. πŸ“ Rank sentences by cosine similarity to the query

This lets the system find the most relevant chunks to a user's question.

▢️ Example Query: "What is Naive Bayes?"
Returns the 3-5 sentences best matching the question.

---

## πŸ“ Extractive Summarization (TextRank)

We use the TextRank algorithm to select the most central sentences:

1. βœ‚οΈ Tokenize into sentences
2. πŸ”— Build a similarity graph of sentence vectors
3. πŸ“Š Rank using PageRank-style weights
4. 🧾 Return top-ranked sentences as summary

No neural generation β€” just high-signal extracts.

▢️ Why TextRank?
- No training needed
- Fast and interpretable
- Works well on lecture slides and academic content

---

## βš™οΈ Development Workflow

πŸ“ File Types Supported:
- PDF (.pdf)
- PPTX exported as PDF (.pptx.pdf)

βš’οΈ Libraries Used:
- pdfplumber (PDF parsing)
- spaCy (NER)
- sklearn (TF-IDF, cosine similarity)
- sumy (TextRank)
- streamlit (Web UI)

πŸ“¦ Output:
- Named Entities (πŸ‘€πŸŒπŸ’)
- Relevant Text Chunks (πŸ”)
- Summary Sentences (πŸ“)
- Download buttons (.txt / .pdf)

---

Use the "Demo" page to explore this pipeline in action! ✨
"""

st.markdown(METHODOLOGY_MD)
pages/3 - Demo.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
from analyzer import analyze_pdf
from pathlib import Path
from io import BytesIO
import base64

# Set up page
st.set_page_config(page_title="πŸ“‚ Demo - PDF Analyzer", layout="wide")
st.title("πŸ“‚ DocuBot Demo")

# Sidebar info
with st.sidebar:
    st.info("""
πŸ‘ˆ Upload a PDF or use the sample provided.
Ask a question (optional) and click Analyze.
You'll receive named entities, relevant chunks, and a summary.
""")

# File upload
st.subheader("πŸ“€ Upload PDF")
uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])

# Use default sample PDF if no upload.
# NOTE(review): the sample handle is opened without a context manager because
# analyze_pdf reads it later in this script run; Streamlit reruns drop it.
if uploaded_file is None:
    sample_path = Path("Part-Unit-2-lecture.pptx.pdf")
    if sample_path.exists():
        uploaded_file = open(sample_path, "rb")
        st.caption("ℹ️ Using default sample: Part-Unit-2-lecture.pptx.pdf")
    else:
        st.warning("Please upload a PDF file to begin.")
        st.stop()

# User query
query = st.text_input("πŸ”Ž Ask a question about the document (optional)", value="What is the main topic of the document?")

if st.button("πŸš€ Analyze Document"):
    with st.spinner("Analyzing... this may take a few seconds..."):
        result = analyze_pdf(uploaded_file, query)

    st.markdown("---")
    st.subheader("🧠 Named Entities")
    col1, col2, col3 = st.columns(3)
    col1.markdown("πŸ‘€ People")
    col1.write(result["entities"].get("people", []))
    col2.markdown("🌍 Places")
    col2.write(result["entities"].get("places", []))
    col3.markdown("🏒 Organizations")
    col3.write(result["entities"].get("organizations", []))

    st.markdown("---")
    st.subheader("πŸ“Œ Relevant Chunks")
    for i, chunk in enumerate(result["relevant_chunks"], 1):
        st.markdown(f"{i}. {chunk}")

    st.markdown("---")
    st.subheader("πŸ“ Summary")
    for i, sentence in enumerate(result["summary"], 1):
        st.markdown(f"{i}. {sentence}")

    # Downloads
    def get_binary_file_downloader_html(bin_data, filename, label):
        """Build an HTML anchor that serves *bin_data* as a base64 data-URI download."""
        b64 = base64.b64encode(bin_data).decode()
        # Fix: the filename parameter was previously ignored (a placeholder
        # literal was embedded instead), so downloads got the wrong name.
        href = f'<a href="data:application/octet-stream;base64,{b64}" download="{filename}">{label}</a>'
        return href

    st.markdown("---")
    st.subheader("⬇️ Download Summary")

    # .txt
    txt_bytes = "\n".join(result["summary"]).encode("utf-8")
    st.markdown(get_binary_file_downloader_html(txt_bytes, "summary.txt", "πŸ“„ Download as .txt"), unsafe_allow_html=True)

    # .pdf (optional)
    from fpdf import FPDF

    # Create PDF
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for line in result["summary"]:
        pdf.multi_cell(0, 10, line)

    # Fix: use dest='S' and encode to bytes, then wrap in BytesIO
    pdf_output = pdf.output(dest='S').encode('latin1')
    pdf_buffer = BytesIO(pdf_output)

    # Now use this to create downloadable content
    st.markdown(get_binary_file_downloader_html(pdf_buffer.getvalue(), "summary.pdf", "πŸ“‘ Download as .pdf"), unsafe_allow_html=True)


st.markdown("---")
st.subheader("⭐ Rate This App")
rating = st.radio("How satisfied are you with this analysis?", ["😑 1", "πŸ˜• 2", "😐 3", "πŸ™‚ 4", "🀩 5"])
feedback = st.text_area("πŸ’¬ Any feedback you'd like to share?")
if st.button("πŸ“© Submit Feedback"):
    st.success("βœ… Thank you! Your response has been recorded.")