Spaces:
Sleeping
Sleeping
New Branch
Browse files- README.md +63 -0
- analyzer.py +89 -0
- app.py +17 -0
- pages/1 - Introduction.py +38 -0
- pages/2 - Methodology.py +82 -0
- pages/3 - Demo.py +96 -0
README.md
CHANGED
|
@@ -11,4 +11,67 @@ license: mit
|
|
| 11 |
short_description: A prototype for final project in NLP
|
| 12 |
---
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 11 |
short_description: A prototype for final project in NLP
|
| 12 |
---
|
| 13 |
|
| 14 |
+
# π DocuBot: PDF Analyzer
|
| 15 |
+
|
| 16 |
+
A lightweight Streamlit app that lets you analyze academic PDFs or lecture slides — no LLMs needed!
|
| 17 |
+
|
| 18 |
+
---
|
| 19 |
+
|
| 20 |
+
## π What This App Does
|
| 21 |
+
- π§ Named Entity Recognition (NER): Extracts people, places, and organizations.
|
| 22 |
+
- π Document Search: Answers your custom questions using TF-IDF relevance.
|
| 23 |
+
- π Extractive Summarization: Highlights the most important sentences using TextRank.
|
| 24 |
+
- π₯ Summary Download: Export your summary as .txt or .pdf.
|
| 25 |
+
- π Light/Dark UI toggle (Streamlit theme).
|
| 26 |
+
|
| 27 |
+
---
|
| 28 |
+
|
| 29 |
+
## π§ͺ How It Works
|
| 30 |
+
- Text is extracted using pdfplumber.
|
| 31 |
+
- Entities are recognized using spaCy's transformer model (en_core_web_trf).
|
| 32 |
+
- Document search uses TF-IDF with cosine similarity.
|
| 33 |
+
- Summarization is done via sumy's TextRank.
|
| 34 |
+
- Everything runs locally in-browser via Streamlit.
|
| 35 |
+
|
| 36 |
+
---
|
| 37 |
+
|
| 38 |
+
## π File Types Supported
|
| 39 |
+
- Standard PDFs (.pdf)
|
| 40 |
+
- Lecture slides saved as PDF (.pptx.pdf)
|
| 41 |
+
|
| 42 |
+
---
|
| 43 |
+
|
| 44 |
+
## π§βπ» How to Use (on Hugging Face Spaces)
|
| 45 |
+
1. Navigate to the "π Demo" tab.
|
| 46 |
+
2. Upload a PDF or use the provided sample.
|
| 47 |
+
3. Optionally ask a question like "What is the main topic?"
|
| 48 |
+
4. View the entities, relevant chunks, and summary.
|
| 49 |
+
5. Download results and rate your experience.
|
| 50 |
+
|
| 51 |
+
---
|
| 52 |
+
|
| 53 |
+
## π Dependencies
|
| 54 |
+
Add these to requirements.txt if you're running locally:
|
| 55 |
+
|
| 56 |
+
```
|
| 57 |
+
streamlit
|
| 58 |
+
pdfplumber
|
| 59 |
+
spacy
|
| 60 |
+
en_core_web_trf
|
| 61 |
+
scikit-learn
|
| 62 |
+
sumy
|
| 63 |
+
fpdf
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
---
|
| 67 |
+
|
| 68 |
+
## π Credits
|
| 69 |
+
Built with π using open-source NLP libraries.
|
| 70 |
+
Project created for learning and experimentation purposes.
|
| 71 |
+
|
| 72 |
+
---
|
| 73 |
+
|
| 74 |
+
Have fun analyzing! π€
|
| 75 |
+
|
| 76 |
+
|
| 77 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
analyzer.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""analyzer.py — classical-NLP PDF analysis pipeline (no LLMs).

Provides text extraction (pdfplumber), NER (spaCy), TF-IDF relevance
search (scikit-learn), and extractive summarization (sumy TextRank).
"""
import pdfplumber
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
import re
import numpy as np

# Load spaCy transformer model
# NOTE(review): loaded eagerly at import time; en_core_web_trf is large and
# slow to load — confirm this is acceptable for the Space's cold-start time.
nlp = spacy.load("en_core_web_trf")
# === Text Cleaning ===
def clean_text(text):
    """Normalize PDF-extracted text for downstream NLP.

    Bullet glyphs are deleted outright (no replacement space), any other
    non-ASCII run becomes a single space to keep word boundaries, and all
    whitespace runs are collapsed to one space.

    :param text: raw text as returned by the PDF extractor
    :return: cleaned, single-line ASCII text
    """
    # Use an explicit escape for the bullet character (U+2022) instead of a
    # raw literal in a regex — the literal is easy to corrupt in transit and
    # re.sub is overkill for a fixed single character.
    text = text.replace("\u2022", "")
    text = re.sub(r"[^\x00-\x7F]+", " ", text)  # strip remaining non-ASCII
    text = re.sub(r"\s+", " ", text)            # collapse whitespace runs
    return text.strip()
# === PDF Extraction ===
def extract_text_from_pdf(file):
    """Extract text from every page of a PDF and return it cleaned.

    :param file: path or binary file-like object accepted by pdfplumber
    :return: cleaned full-document text (see clean_text)
    """
    with pdfplumber.open(file) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    # extract_text() returns None for image-only pages; keep only real text,
    # newline-terminated exactly as the original accumulation did.
    raw = "".join(content + "\n" for content in page_texts if content)
    return clean_text(raw)
# === Named Entity Recognition ===
def perform_ner(text):
    """Run spaCy NER over *text* and bucket entities by coarse category.

    :param text: document text
    :return: dict with keys "people", "places", "organizations", each a
             list of entity strings in document order
    """
    buckets = {"people": [], "places": [], "organizations": []}
    label_to_bucket = {
        "PERSON": "people",
        "GPE": "places",
        "LOC": "places",
        "ORG": "organizations",
    }
    # Single pass over the entities instead of one comprehension per label.
    for ent in nlp(text).ents:
        bucket = label_to_bucket.get(ent.label_)
        if bucket is not None:
            buckets[bucket].append(ent.text)
    return buckets
# === TF-IDF Relevance ===
def get_relevant_chunks(query, text, num_chunks=5):
    """Return up to *num_chunks* sentences from *text*, ranked by TF-IDF
    cosine similarity to *query* (most similar first).

    :param query: the user's question
    :param text: full document text
    :param num_chunks: maximum number of sentences to return
    :return: list of sentence strings, best match first; empty list when
             the document yields no usable sentences
    """
    sentences = [sent.text.strip() for sent in nlp(text).sents if len(sent.text.strip()) > 10]
    # BUG FIX: TfidfVectorizer raises on an effectively empty corpus
    # (e.g. a scanned/image-only PDF); return no chunks instead of crashing.
    if not sentences:
        return []
    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
    # Fit on sentences plus the query so they share one vocabulary;
    # the final row of the matrix is the query vector.
    tfidf_matrix = vectorizer.fit_transform(sentences + [query])
    cosine_sim = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])
    # argsort is ascending, so take the tail and reverse for best-first.
    indices = cosine_sim.argsort()[0, -num_chunks:][::-1]
    return [sentences[i] for i in indices]
# === Summary Cleanup ===
def deduplicate(sentences):
    """Strip whitespace from each sentence and drop duplicates,
    preserving first-seen order.

    :param sentences: iterable of sentence strings
    :return: list of unique stripped sentences
    """
    # dict preserves insertion order, so fromkeys dedupes while keeping
    # the first occurrence of each stripped sentence.
    return list(dict.fromkeys(item.strip() for item in sentences))
def is_too_technical(s):
    """Heuristic filter: flag sentences that read like equation dumps
    (many '=' signs) or are excessively long.
    """
    if len(s) > 300:
        return True
    return s.count("=") > 3
def is_tabular(s):
    """Heuristic filter: flag sentences that start with a digit or contain
    many separate numbers — likely table/figure residue from the PDF.
    """
    starts_with_digit = re.match(r'^\d', s) is not None
    number_runs = re.findall(r'\d+', s)
    return starts_with_digit or len(number_runs) > 6
def shorten(s, limit=250):
    """Truncate *s* near *limit* characters at a word boundary and append
    an ellipsis; strings within the limit pass through unchanged.
    """
    if len(s) <= limit:
        return s
    # Cut at the limit, then drop the (possibly partial) last word.
    truncated = s[:limit]
    return truncated.rsplit(" ", 1)[0] + "..."
def filter_summary(summary):
    """Post-process raw summary sentences: dedupe, drop equation-heavy and
    table-like sentences, and shorten what remains.

    :param summary: list of raw summary sentence strings
    :return: cleaned list of sentences
    """
    cleaned = []
    for sentence in deduplicate(summary):
        # Skip sentences that look like equations or table residue.
        if is_too_technical(sentence) or is_tabular(sentence):
            continue
        cleaned.append(shorten(sentence))
    return cleaned
# === TextRank Summarizer ===
def summarize_text(text, num_sentences=10):
    """Produce an extractive summary of *text* using sumy's TextRank,
    then clean it with filter_summary.

    :param text: full document text
    :param num_sentences: number of sentences TextRank should select
    :return: filtered list of summary sentences (may be shorter than
             num_sentences after filtering)
    """
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    ranked_sentences = TextRankSummarizer()(parser.document, num_sentences)
    return filter_summary([str(sentence) for sentence in ranked_sentences])
# === Top-Level Function ===
def analyze_pdf(file, query):
    """Run the full analysis pipeline on one PDF.

    :param file: path or binary file-like object for the PDF
    :param query: user question used for relevance ranking
    :return: dict with "entities" (NER buckets), "relevant_chunks"
             (query-ranked sentences), and "summary" (filtered TextRank
             sentences)
    """
    text = extract_text_from_pdf(file)
    # Dict values evaluate left-to-right, preserving the original
    # entities -> chunks -> summary execution order.
    return {
        "entities": perform_ner(text),
        "relevant_chunks": get_relevant_chunks(query, text),
        "summary": summarize_text(text),
    }
|
app.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# == app.py ==
"""DocuBot landing page: configures the Streamlit app and directs users
to the sidebar pages (Introduction, Methodology, Demo)."""
import streamlit as st

st.set_page_config(page_title="DocuBot", layout="wide", initial_sidebar_state="expanded")
st.title("π€ DocuBot")
# BUG FIX: welcome text previously misspelled the app name ("DocuBott")
# and had stray punctuation ("!.").
st.markdown("""
Welcome to the **DocuBot**! Navigate using the sidebar.
- Learn about the app on the **Introduction** page
- Understand its design on the **Methodology** page
- Try it out on the **Demo** page

β¨ Features:
- PDF upload and document analysis (NER, summarization)
- Custom user query support for QA
- Downloadable summary report
- Light/dark mode compatible
""")
|
pages/1 - Introduction.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# == pages/1 - Introduction.py ==
"""Static Introduction page: renders a markdown overview of DocuBot's
purpose, use cases, and feature list. No interactive logic."""
import streamlit as st

st.set_page_config(page_title="π Introduction", layout="wide")

st.title("π Welcome to DocuBot: PDF Analyzer")

# Single static markdown block — all page content lives in this literal.
st.markdown("""
DocuBot is a lightweight, efficient, and interpretable PDF document analysis tool built for academic and technical materials.

### π― Objective
To build a reliable system that:
- Extracts text from PDF and PPTX-based lecture slides.
- Applies Named Entity Recognition (NER) to highlight important people, places, and organizations.
- Performs document-level question answering using TF-IDF.
- Summarizes content extractively using TextRank.

### π Use Case
Whether you're studying, reviewing a report, or evaluating a research paper, DocuBot helps you:
- Quickly understand the core topics.
- Search through document segments using natural questions.
- Get concise summaries without reading everything manually.

### π‘ Why This Matters
Many educational PDFs (especially slides) are dense with fragmented bullets and equations. DocuBot is designed to cleanly parse and analyze these, helping users:
- Save time
- Focus on relevant content
- Extract technical insights

### π Features
- Upload .pdf and .pptx.pdf files
- Named Entity Recognition (NER)
- Relevance-based document QA
- Extractive summarization (no LLMs!)
- Optional light/dark UI themes
- Downloadable summary in .txt and .pdf

Jump to the "Demo" tab to try it yourself! π
""")
|
pages/2 - Methodology.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# == pages/2 - Methodology.py ==
"""Static Methodology page: explains the NER / TF-IDF / TextRank pipeline
in a single markdown block. No interactive logic."""
import streamlit as st

st.set_page_config(page_title="π§ͺ Methodology", layout="wide")

st.title("π§ͺ Methodology")

# Single static markdown block — all page content lives in this literal.
st.markdown("""
This section outlines the techniques and processes that power DocuBot. The system uses classical NLP techniques (not large language models) for transparency, efficiency, and reproducibility.

---

## π§ Named Entity Recognition (NER)

We use spaCy's en_core_web_trf transformer model to identify:

- π€ People (e.g., scientists, authors, historical figures)
- π Locations (cities, countries, geographic entities)
- π’ Organizations (institutions, universities, companies)

NER helps highlight key actors and topics within the document.

βΆοΈ Example:
"Bayes Theorem was developed by Thomas Bayes" βΆ PERSON: Thomas Bayes

---

## π Document Search (QA by TF-IDF)

We divide the document into sentences, then compute TF-IDF scores:

1. π Tokenize the document into sentences
2. π Compute TF-IDF for each sentence and the query
3. π Rank sentences by cosine similarity to the query

This lets the system find the most relevant chunks to a user's question.

βΆοΈ Example Query: "What is Naive Bayes?"
Returns the 3-5 sentences best matching the question.

---

## π Extractive Summarization (TextRank)

We use the TextRank algorithm to select the most central sentences:

1. βοΈ Tokenize into sentences
2. π Build a similarity graph of sentence vectors
3. π Rank using PageRank-style weights
4. π§Ύ Return top-ranked sentences as summary

No neural generation β just high-signal extracts.

βΆοΈ Why TextRank?
- No training needed
- Fast and interpretable
- Works well on lecture slides and academic content

---

## βοΈ Development Workflow

π File Types Supported:
- PDF (.pdf)
- PPTX exported as PDF (.pptx.pdf)

βοΈ Libraries Used:
- pdfplumber (PDF parsing)
- spaCy (NER)
- sklearn (TF-IDF, cosine similarity)
- sumy (TextRank)
- streamlit (Web UI)

π¦ Output:
- Named Entities (π€ππ’)
- Relevant Text Chunks (π)
- Summary Sentences (π)
- Download buttons (.txt / .pdf)

---

Use the "Demo" page to explore this pipeline in action! β¨
""")
|
pages/3 - Demo.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# == pages/3 - Demo.py ==
"""Interactive demo page: upload a PDF (or fall back to a bundled sample),
optionally ask a question, then display named entities, query-relevant
chunks, and a downloadable summary (.txt / .pdf)."""
import streamlit as st
from analyzer import analyze_pdf
from pathlib import Path
from io import BytesIO
import base64

from fpdf import FPDF  # moved from mid-script: imports belong at the top of the file

# Set up page
st.set_page_config(page_title="π Demo - PDF Analyzer", layout="wide")
st.title("π DocuBot Demo")

# Sidebar info
with st.sidebar:
    st.info("""
π Upload a PDF or use the sample provided.
Ask a question (optional) and click Analyze.
You'll receive named entities, relevant chunks, and a summary.
""")

# File upload
st.subheader("π€ Upload PDF")
uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])

# Use default sample PDF if no upload
if uploaded_file is None:
    sample_path = Path("Part-Unit-2-lecture.pptx.pdf")
    if sample_path.exists():
        # NOTE(review): this handle is never explicitly closed; tolerable in a
        # short-lived Streamlit rerun, but a context manager would be cleaner.
        uploaded_file = open(sample_path, "rb")
        st.caption("βΉοΈ Using default sample: Part-Unit-2-lecture.pptx.pdf")
    else:
        st.warning("Please upload a PDF file to begin.")
        st.stop()

# User query
query = st.text_input("π Ask a question about the document (optional)", value="What is the main topic of the document?")

if st.button("π Analyze Document"):
    with st.spinner("Analyzing... this may take a few seconds..."):
        result = analyze_pdf(uploaded_file, query)

    st.markdown("---")
    st.subheader("π§ Named Entities")
    col1, col2, col3 = st.columns(3)
    col1.markdown("π€ People")
    col1.write(result["entities"].get("people", []))
    col2.markdown("π Places")
    col2.write(result["entities"].get("places", []))
    col3.markdown("π’ Organizations")
    col3.write(result["entities"].get("organizations", []))

    st.markdown("---")
    st.subheader("π Relevant Chunks")
    for i, chunk in enumerate(result["relevant_chunks"], 1):
        st.markdown(f"{i}. {chunk}")

    st.markdown("---")
    st.subheader("π Summary")
    for i, sentence in enumerate(result["summary"], 1):
        st.markdown(f"{i}. {sentence}")

    # Downloads
    def get_binary_file_downloader_html(bin_data, filename, label):
        """Build an HTML anchor that downloads *bin_data* under *filename*."""
        b64 = base64.b64encode(bin_data).decode()
        # BUG FIX: the download attribute previously ignored the filename
        # parameter, so every file downloaded under the wrong name.
        href = f'<a href="data:application/octet-stream;base64,{b64}" download="{filename}">{label}</a>'
        return href

    st.markdown("---")
    st.subheader("β¬οΈ Download Summary")

    # .txt
    txt_bytes = "\n".join(result["summary"]).encode("utf-8")
    st.markdown(get_binary_file_downloader_html(txt_bytes, "summary.txt", "π Download as .txt"), unsafe_allow_html=True)

    # .pdf — build with fpdf entirely in memory
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for line in result["summary"]:
        pdf.multi_cell(0, 10, line)

    # dest='S' returns the document as a string; encode to bytes, wrap in BytesIO
    pdf_output = pdf.output(dest='S').encode('latin1')
    pdf_buffer = BytesIO(pdf_output)

    st.markdown(get_binary_file_downloader_html(pdf_buffer.getvalue(), "summary.pdf", "π Download as .pdf"), unsafe_allow_html=True)


st.markdown("---")
st.subheader("β Rate This App")
rating = st.radio("How satisfied are you with this analysis?", ["π‘ 1", "π 2", "π 3", "π 4", "π€© 5"])
feedback = st.text_area("π¬ Any feedback you'd like to share?")
if st.button("π© Submit Feedback"):
    st.success("β Thank you! Your response has been recorded.")
|