Soundaryasos's picture
Update app.py
07cbcbe verified
import streamlit as st
import fitz # PyMuPDF
import docx
import nltk
import spacy
import networkx as nx
import matplotlib.pyplot as plt
from transformers import pipeline
from collections import Counter
import os
# Ensure the spaCy model is available: load it, and if spaCy reports it as
# missing (OSError), download it once and load again.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # Imported here because only this fallback path needs them.
    import subprocess
    import sys

    # Use the interpreter that is running this app (a bare "python" on PATH may
    # be a different environment), avoid the shell, and fail loudly if the
    # download itself fails instead of silently continuing.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_lg"],
        check=True,
    )
    nlp = spacy.load("en_core_web_lg")
# Module-level NLP resources, created once per script run.
# The NLTK "punkt" resource is fetched here; nothing in the visible code calls
# NLTK directly afterwards.
nltk.download("punkt")
# Hugging Face pipelines used by summarize_text() and answer_question().
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
qa_pipeline = pipeline(task="question-answering")
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the plain text of every page of a PDF, joined by newlines.

    Args:
        pdf_file: Binary file-like object exposing ``read()``; its full
            content is read into memory and parsed as a PDF.

    Returns:
        str: The text of each page, in page order, separated by newlines.
    """
    # Open the document as a context manager so it is closed as soon as the
    # text has been collected, rather than being left open after the call.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "\n".join(page.get_text("text") for page in doc)
# Function to extract text from DOCX
def extract_text_from_docx(docx_file):
    """Return the text of all paragraphs of a DOCX file, joined by newlines.

    Args:
        docx_file: Value accepted by ``docx.Document`` (the UI code passes
            the uploaded file object).

    Returns:
        str: Paragraph texts in document order, one per line.
    """
    document = docx.Document(docx_file)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
# Summarization function
def summarize_text(text):
    """Summarize *text* with the module-level ``summarizer`` pipeline.

    Args:
        text: Document text to summarize.

    Returns:
        str: The ``summary_text`` of the first pipeline result, or ``""``
        when *text* is empty or contains only whitespace.
    """
    # Nothing to summarize: skip the model call entirely.
    if not text or not text.strip():
        return ""
    # truncation=True lets the tokenizer cut inputs that exceed the model's
    # maximum input length instead of failing on long documents; in that case
    # only the leading part of the document contributes to the summary.
    results = summarizer(
        text,
        max_length=200,
        min_length=50,
        do_sample=False,
        truncation=True,
    )
    return results[0]["summary_text"]
# Q&A Function
def answer_question(text, question):
    """Answer *question* from *text* with the module-level ``qa_pipeline``.

    Args:
        text: Context passage the answer is looked up in.
        question: Question to answer.

    Returns:
        The ``"answer"`` entry of the pipeline result.
    """
    qa_input = {"context": text, "question": question}
    prediction = qa_pipeline(qa_input)
    return prediction["answer"]
# Named Entity Recognition (NER)
def extract_entities(text):
    """Return the named entities the spaCy pipeline ``nlp`` finds in *text*.

    Args:
        text: Text to analyse.

    Returns:
        list[tuple]: One ``(entity_text, entity_label)`` pair per entity,
        in the order they appear in ``doc.ents``.
    """
    parsed = nlp(text)
    found = []
    for span in parsed.ents:
        found.append((span.text, span.label_))
    return found
# Generate Mind Map
def generate_mind_map(text):
    """Draw the named entities of *text* as a node graph in the Streamlit page.

    Every distinct entity string becomes one node whose size is 100 times
    the number of times that entity occurs. No edges are added, so the
    layout only spreads the nodes out.

    Args:
        text: Text to run through the spaCy pipeline ``nlp``.

    Returns:
        None. The figure is rendered with ``st.pyplot``.
    """
    doc = nlp(text)
    entity_counts = Counter(ent.text for ent in doc.ents)

    G = nx.Graph()
    for entity, count in entity_counts.items():
        G.add_node(entity, size=count * 100)
    pos = nx.spring_layout(G)

    # Draw on an explicit Figure/Axes rather than pyplot's global state, and
    # always close the figure afterwards: the script reruns on every user
    # interaction, so unclosed figures would pile up in memory.
    fig, ax = plt.subplots(figsize=(10, 7))
    try:
        nx.draw(
            G,
            pos,
            ax=ax,
            with_labels=True,
            node_size=[G.nodes[n]["size"] for n in G.nodes],
            node_color="skyblue",
        )
        ax.set_title("Mind Map of Entities")
        st.pyplot(fig)
    finally:
        plt.close(fig)
# Streamlit UI
# Page flow: upload a PDF/DOCX, preview the extracted text, then optionally
# summarize it, ask questions about it, list its named entities, or draw the
# entity mind map. Streamlit reruns this script on every interaction.
st.set_page_config(page_title="Legal Document Summarizer & Query System", layout="wide")
st.title("📜 Legal Document Summarization, NER & Mind Map System")
st.markdown("""Upload a legal document, get a summary, extract entities, and generate a mind map!""")

# File uploader
uploaded_file = st.file_uploader("Upload a PDF or DOCX", type=["pdf", "docx"])

if uploaded_file:
    # Pick the extractor from the MIME type reported for the upload.
    if uploaded_file.type == "application/pdf":
        document_text = extract_text_from_pdf(uploaded_file)
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        document_text = extract_text_from_docx(uploaded_file)
    else:
        st.error("Unsupported file format!")
        st.stop()

    # A document with no extractable text (for example a scanned PDF) would
    # otherwise reach the question-answering pipeline with an empty context.
    if not document_text.strip():
        st.warning("No extractable text was found in this document.")
        st.stop()

    st.subheader("Extracted Text Preview")
    # Only the first 2000 characters are shown in the preview.
    st.text_area("Document Content", document_text[:2000], height=250)

    # Summarization
    if st.button("Summarize Document"):
        summary = summarize_text(document_text)
        st.subheader("📌 Summary")
        st.success(summary)

    # Question Answering
    user_question = st.text_input("Ask a question about the document:")
    if user_question:
        answer = answer_question(document_text, user_question)
        # NOTE(review): this heading's emoji was restored from a mis-encoded
        # byte sequence whose last byte was lost; the memo emoji is assumed.
        st.subheader("📝 Answer")
        st.info(answer)

    # Named Entity Recognition
    if st.button("Extract Entities"):
        entities = extract_entities(document_text)
        st.subheader("📌 Named Entities")
        for entity, label in entities:
            st.write(f"**{entity}** - {label}")

    # Mind Map Generation
    if st.button("Generate Mind Map"):
        st.subheader("🧠 Mind Map of Entities")
        generate_mind_map(document_text)

# Footer, shown whether or not a file has been uploaded.
st.markdown("---")
st.caption("🚀 Built with Hugging Face, spaCy, and Streamlit")