mrciomnl commited on
Commit
929fd26
Β·
1 Parent(s): db47027

New Branch

Browse files
Files changed (6) hide show
  1. README.md +63 -0
  2. analyzer.py +89 -0
  3. app.py +17 -0
  4. pages/1 - Introduction.py +38 -0
  5. pages/2 - Methodology.py +82 -0
  6. pages/3 - Demo.py +96 -0
README.md CHANGED
@@ -11,4 +11,67 @@ license: mit
11
  short_description: A prototype for final project in NLP
12
  ---
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
11
  short_description: A prototype for final project in NLP
12
  ---
13
 
14
+ # πŸ“„ DocuBot: PDF Analyzer
15
+
16
+ A lightweight Streamlit app that lets you analyze academic PDFs or lecture slides — no LLMs needed!
17
+
18
+ ---
19
+
20
+ ## πŸš€ What This App Does
21
+ - 🧠 Named Entity Recognition (NER): Extracts people, places, and organizations.
22
+ - πŸ” Document Search: Answers your custom questions using TF-IDF relevance.
23
+ - πŸ“ Extractive Summarization: Highlights the most important sentences using TextRank.
24
+ - πŸ“₯ Summary Download: Export your summary as .txt or .pdf.
25
+ - πŸŒ— Light/Dark UI toggle (Streamlit theme).
26
+
27
+ ---
28
+
29
+ ## πŸ§ͺ How It Works
30
+ - Text is extracted using pdfplumber.
31
+ - Entities are recognized using spaCy's transformer model (en_core_web_trf).
32
+ - Document search uses TF-IDF with cosine similarity.
33
+ - Summarization is done via sumy's TextRank.
34
+ - Everything runs locally in-browser via Streamlit.
35
+
36
+ ---
37
+
38
+ ## πŸ“‚ File Types Supported
39
+ - Standard PDFs (.pdf)
40
+ - Lecture slides saved as PDF (.pptx.pdf)
41
+
42
+ ---
43
+
44
+ ## πŸ§‘β€πŸ’» How to Use (on Hugging Face Spaces)
45
+ 1. Navigate to the "πŸ“‚ Demo" tab.
46
+ 2. Upload a PDF or use the provided sample.
47
+ 3. Optionally ask a question like "What is the main topic?"
48
+ 4. View the entities, relevant chunks, and summary.
49
+ 5. Download results and rate your experience.
50
+
51
+ ---
52
+
53
+ ## πŸ›  Dependencies
54
+ Add these to requirements.txt if you're running locally:
55
+
56
+ ```
57
+ streamlit
58
+ pdfplumber
59
+ spacy
60
+ en_core_web_trf
61
+ scikit-learn
62
+ sumy
63
+ fpdf
64
+ ```
65
+
66
+ ---
67
+
68
+ ## πŸ™Œ Credits
69
+ Built with πŸ’™ using open-source NLP libraries.
70
+ Project created for learning and experimentation purposes.
71
+
72
+ ---
73
+
74
+ Have fun analyzing! πŸ€–
75
+
76
+
77
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
analyzer.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
2
+ import spacy
3
+ from sklearn.feature_extraction.text import TfidfVectorizer
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+ from sumy.parsers.plaintext import PlaintextParser
6
+ from sumy.nlp.tokenizers import Tokenizer
7
+ from sumy.summarizers.text_rank import TextRankSummarizer
8
+ import re
9
+ import numpy as np
10
+
11
+ # Load spaCy transformer model
12
+ nlp = spacy.load("en_core_web_trf")
13
+
14
# === Text Cleaning ===
def clean_text(text):
    """Normalize raw PDF text.

    Removes bullet glyphs, replaces any remaining non-ASCII characters with
    spaces, collapses runs of whitespace, and trims the ends.
    """
    without_bullets = re.sub(r"β€’", "", text)
    ascii_only = re.sub(r"[^\x00-\x7F]+", " ", without_bullets)
    collapsed = re.sub(r"\s+", " ", ascii_only)
    return collapsed.strip()
20
+
21
# === PDF Extraction ===
def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF and return it cleaned.

    Pages from which pdfplumber cannot extract text are skipped.
    """
    pages = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            content = page.extract_text()
            if content:
                pages.append(content + "\n")
    return clean_text("".join(pages))
30
+
31
# === Named Entity Recognition ===
def perform_ner(text):
    """Group the document's named entities into people, places, and organizations.

    Uses the module-level spaCy pipeline; order of mentions is preserved
    within each category.
    """
    buckets = {"people": [], "places": [], "organizations": []}
    label_to_bucket = {
        "PERSON": "people",
        "GPE": "places",
        "LOC": "places",
        "ORG": "organizations",
    }
    for ent in nlp(text).ents:
        bucket = label_to_bucket.get(ent.label_)
        if bucket is not None:
            buckets[bucket].append(ent.text)
    return buckets
39
+
40
# === TF-IDF Relevance ===
def get_relevant_chunks(query, text, num_chunks=5):
    """Return up to *num_chunks* sentences of *text* most similar to *query*.

    Sentences of 10 characters or fewer are discarded before scoring.
    Similarity is cosine similarity over 1-2 gram TF-IDF vectors.

    Fix: previously an empty/very short document produced an empty sentence
    list and TfidfVectorizer.fit_transform raised ValueError; now returns [].
    """
    sentences = [sent.text.strip() for sent in nlp(text).sents if len(sent.text.strip()) > 10]
    if not sentences:
        return []
    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
    # Vectorize the query together with the sentences so they share a vocabulary;
    # the query occupies the last row of the matrix.
    tfidf_matrix = vectorizer.fit_transform(sentences + [query])
    cosine_sim = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])
    # Highest-scoring sentence indices, best first (fewer than num_chunks is fine).
    indices = cosine_sim.argsort()[0, -num_chunks:][::-1]
    return [sentences[i] for i in indices]
48
+
49
# === Summary Cleanup ===
def deduplicate(sentences):
    """Strip each sentence and drop later duplicates, preserving first-seen order."""
    unique = []
    seen = set()
    for sentence in sentences:
        cleaned = sentence.strip()
        if cleaned in seen:
            continue
        seen.add(cleaned)
        unique.append(cleaned)
    return unique
59
+
60
def is_too_technical(s):
    """Flag equation-heavy (more than three '=') or very long (>300 chars) sentences."""
    if len(s) > 300:
        return True
    return s.count("=") > 3
62
+
63
def is_tabular(s):
    """Heuristic for table-like rows: starts with a digit, or has more than six number runs."""
    starts_with_digit = re.match(r'^\d', s) is not None
    numeric_runs = re.findall(r'\d+', s)
    return starts_with_digit or len(numeric_runs) > 6
65
+
66
def shorten(s, limit=250):
    """Return *s* unchanged if within *limit*; otherwise cut at the last word
    boundary inside the limit and append an ellipsis."""
    if len(s) <= limit:
        return s
    truncated = s[:limit].rsplit(" ", 1)[0]
    return truncated + "..."
68
+
69
def filter_summary(summary):
    """Post-process summary sentences: drop duplicates, technical and
    table-like lines, and clip the survivors to a readable length."""
    kept = []
    for sentence in deduplicate(summary):
        if is_too_technical(sentence) or is_tabular(sentence):
            continue
        kept.append(shorten(sentence))
    return kept
71
+
72
# === TextRank Summarizer ===
def summarize_text(text, num_sentences=10):
    """Produce an extractive summary of *text* using sumy's TextRank,
    filtered through filter_summary()."""
    document = PlaintextParser.from_string(text, Tokenizer("english")).document
    ranked = TextRankSummarizer()(document, num_sentences)
    return filter_summary([str(sentence) for sentence in ranked])
78
+
79
# === Top-Level Function ===
def analyze_pdf(file, query):
    """Run the full pipeline on an open PDF file object.

    Returns a dict with keys "entities", "relevant_chunks", and "summary".
    """
    text = extract_text_from_pdf(file)
    return {
        "entities": perform_ner(text),
        "relevant_chunks": get_relevant_chunks(query, text),
        "summary": summarize_text(text),
    }
app.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# == app.py ==
# Landing page: configures the app shell and points users at the sidebar pages.
import streamlit as st

st.set_page_config(page_title="DocuBot", layout="wide", initial_sidebar_state="expanded")
st.title("πŸ€– DocuBot")
# Fix: welcome line previously read "**DocuBott**!." (misspelled app name and
# doubled punctuation).
st.markdown("""
Welcome to the **DocuBot**! Navigate using the sidebar.
- Learn about the app on the **Introduction** page
- Understand its design on the **Methodology** page
- Try it out on the **Demo** page

✨ Features:
- PDF upload and document analysis (NER, summarization)
- Custom user query support for QA
- Downloadable summary report
- Light/dark mode compatible
""")
pages/1 - Introduction.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st

st.set_page_config(page_title="πŸ“˜ Introduction", layout="wide")

st.title("πŸ“˜ Welcome to DocuBot: PDF Analyzer")

# Static overview copy for the Introduction page; kept in one constant so the
# layout code stays minimal.
INTRO_MD = """
DocuBot is a lightweight, efficient, and interpretable PDF document analysis tool built for academic and technical materials.

### 🎯 Objective
To build a reliable system that:
- Extracts text from PDF and PPTX-based lecture slides.
- Applies Named Entity Recognition (NER) to highlight important people, places, and organizations.
- Performs document-level question answering using TF-IDF.
- Summarizes content extractively using TextRank.

### πŸ” Use Case
Whether you're studying, reviewing a report, or evaluating a research paper, DocuBot helps you:
- Quickly understand the core topics.
- Search through document segments using natural questions.
- Get concise summaries without reading everything manually.

### πŸ’‘ Why This Matters
Many educational PDFs (especially slides) are dense with fragmented bullets and equations. DocuBot is designed to cleanly parse and analyze these, helping users:
- Save time
- Focus on relevant content
- Extract technical insights

### 🌐 Features
- Upload .pdf and .pptx.pdf files
- Named Entity Recognition (NER)
- Relevance-based document QA
- Extractive summarization (no LLMs!)
- Optional light/dark UI themes
- Downloadable summary in .txt and .pdf

Jump to the "Demo" tab to try it yourself! πŸš€
"""

st.markdown(INTRO_MD)
pages/2 - Methodology.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st

st.set_page_config(page_title="πŸ§ͺ Methodology", layout="wide")

st.title("πŸ§ͺ Methodology")

# Static explanatory copy for the Methodology page, hoisted into a constant.
METHODOLOGY_MD = """
This section outlines the techniques and processes that power DocuBot. The system uses classical NLP techniques (not large language models) for transparency, efficiency, and reproducibility.

---

## 🧠 Named Entity Recognition (NER)

We use spaCy's en_core_web_trf transformer model to identify:

- πŸ‘€ People (e.g., scientists, authors, historical figures)
- 🌍 Locations (cities, countries, geographic entities)
- 🏒 Organizations (institutions, universities, companies)

NER helps highlight key actors and topics within the document.

▢️ Example:
"Bayes Theorem was developed by Thomas Bayes" ⟢ PERSON: Thomas Bayes

---

## πŸ”Ž Document Search (QA by TF-IDF)

We divide the document into sentences, then compute TF-IDF scores:

1. πŸ“œ Tokenize the document into sentences
2. πŸ“ˆ Compute TF-IDF for each sentence and the query
3. πŸ“ Rank sentences by cosine similarity to the query

This lets the system find the most relevant chunks to a user's question.

▢️ Example Query: "What is Naive Bayes?"
Returns the 3-5 sentences best matching the question.

---

## πŸ“ Extractive Summarization (TextRank)

We use the TextRank algorithm to select the most central sentences:

1. βœ‚οΈ Tokenize into sentences
2. πŸ”— Build a similarity graph of sentence vectors
3. πŸ“Š Rank using PageRank-style weights
4. 🧾 Return top-ranked sentences as summary

No neural generation β€” just high-signal extracts.

▢️ Why TextRank?
- No training needed
- Fast and interpretable
- Works well on lecture slides and academic content

---

## βš™οΈ Development Workflow

πŸ“ File Types Supported:
- PDF (.pdf)
- PPTX exported as PDF (.pptx.pdf)

βš’οΈ Libraries Used:
- pdfplumber (PDF parsing)
- spaCy (NER)
- sklearn (TF-IDF, cosine similarity)
- sumy (TextRank)
- streamlit (Web UI)

πŸ“¦ Output:
- Named Entities (πŸ‘€πŸŒπŸ’)
- Relevant Text Chunks (πŸ”)
- Summary Sentences (πŸ“)
- Download buttons (.txt / .pdf)

---

Use the "Demo" page to explore this pipeline in action! ✨
"""

st.markdown(METHODOLOGY_MD)
pages/3 - Demo.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
from analyzer import analyze_pdf
from pathlib import Path
from io import BytesIO
import base64

# Set up page
st.set_page_config(page_title="πŸ“‚ Demo - PDF Analyzer", layout="wide")
st.title("πŸ“‚ DocuBot Demo")

# Sidebar info
with st.sidebar:
    st.info("""
πŸ‘ˆ Upload a PDF or use the sample provided.
Ask a question (optional) and click Analyze.
You'll receive named entities, relevant chunks, and a summary.
""")

# File upload
st.subheader("πŸ“€ Upload PDF")
uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])

# Use default sample PDF if no upload.
# NOTE(review): the sample handle is opened without a context manager because
# analyze_pdf reads it later in this script run; Streamlit reruns drop it.
if uploaded_file is None:
    sample_path = Path("Part-Unit-2-lecture.pptx.pdf")
    if sample_path.exists():
        uploaded_file = open(sample_path, "rb")
        st.caption("ℹ️ Using default sample: Part-Unit-2-lecture.pptx.pdf")
    else:
        st.warning("Please upload a PDF file to begin.")
        st.stop()

# User query
query = st.text_input("πŸ”Ž Ask a question about the document (optional)", value="What is the main topic of the document?")

if st.button("πŸš€ Analyze Document"):
    with st.spinner("Analyzing... this may take a few seconds..."):
        result = analyze_pdf(uploaded_file, query)

    st.markdown("---")
    st.subheader("🧠 Named Entities")
    col1, col2, col3 = st.columns(3)
    col1.markdown("πŸ‘€ People")
    col1.write(result["entities"].get("people", []))
    col2.markdown("🌍 Places")
    col2.write(result["entities"].get("places", []))
    col3.markdown("🏒 Organizations")
    col3.write(result["entities"].get("organizations", []))

    st.markdown("---")
    st.subheader("πŸ“Œ Relevant Chunks")
    for i, chunk in enumerate(result["relevant_chunks"], 1):
        st.markdown(f"{i}. {chunk}")

    st.markdown("---")
    st.subheader("πŸ“ Summary")
    for i, sentence in enumerate(result["summary"], 1):
        st.markdown(f"{i}. {sentence}")

    # Downloads
    def get_binary_file_downloader_html(bin_data, filename, label):
        """Build an HTML anchor that serves *bin_data* as a base64 data-URI download."""
        b64 = base64.b64encode(bin_data).decode()
        # Fix: the filename parameter was previously ignored (a placeholder
        # literal was embedded instead), so downloads got the wrong name.
        href = f'<a href="data:application/octet-stream;base64,{b64}" download="{filename}">{label}</a>'
        return href

    st.markdown("---")
    st.subheader("⬇️ Download Summary")

    # .txt
    txt_bytes = "\n".join(result["summary"]).encode("utf-8")
    st.markdown(get_binary_file_downloader_html(txt_bytes, "summary.txt", "πŸ“„ Download as .txt"), unsafe_allow_html=True)

    # .pdf (optional)
    from fpdf import FPDF

    # Create PDF
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for line in result["summary"]:
        pdf.multi_cell(0, 10, line)

    # Fix: use dest='S' and encode to bytes, then wrap in BytesIO
    pdf_output = pdf.output(dest='S').encode('latin1')
    pdf_buffer = BytesIO(pdf_output)

    # Now use this to create downloadable content
    st.markdown(get_binary_file_downloader_html(pdf_buffer.getvalue(), "summary.pdf", "πŸ“‘ Download as .pdf"), unsafe_allow_html=True)


st.markdown("---")
st.subheader("⭐ Rate This App")
rating = st.radio("How satisfied are you with this analysis?", ["😑 1", "πŸ˜• 2", "😐 3", "πŸ™‚ 4", "🀩 5"])
feedback = st.text_area("πŸ’¬ Any feedback you'd like to share?")
if st.button("πŸ“© Submit Feedback"):
    st.success("βœ… Thank you! Your response has been recorded.")