"""Streamlit app for legal documents: summary, Q&A, entity list and mind map.

The user uploads a PDF or DOCX file. The extracted text can then be
summarized, queried with free-form questions, scanned for named entities,
and visualised as a graph of entity nodes sized by frequency.
"""

import importlib
import os
import subprocess
import sys
from collections import Counter

import docx
import fitz  # PyMuPDF
import matplotlib.pyplot as plt
import networkx as nx
import nltk
import spacy
import streamlit as st
from transformers import pipeline

SPACY_MODEL = "en_core_web_lg"
SUMMARIZER_MODEL = "facebook/bart-large-cnn"
PDF_MIME = "application/pdf"
DOCX_MIME = (
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)

# Page configuration is issued before anything else touches Streamlit so that
# the cached loaders below cannot emit an element ahead of it.
st.set_page_config(
    page_title="Legal Document Summarizer & Query System",
    layout="wide",
)


# --- Model loading ----------------------------------------------------------
# Streamlit re-executes this script on every widget interaction. The loaders
# are wrapped in st.cache_resource so the heavy models are built once and
# reused across reruns instead of being reloaded each time.


@st.cache_resource
def _load_spacy_model():
    """Return the spaCy pipeline, downloading the model if it is missing.

    Raises:
        subprocess.CalledProcessError: if the model download fails.
    """
    try:
        return spacy.load(SPACY_MODEL)
    except OSError:
        # Use the running interpreter so the model is installed into the
        # same environment; check=True surfaces a failed download instead of
        # falling through to a second, confusing load error.
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", SPACY_MODEL],
            check=True,
        )
        # The package was installed while this process was running.
        importlib.invalidate_caches()
        return spacy.load(SPACY_MODEL)


@st.cache_resource
def _download_nltk_data():
    """Fetch the NLTK "punkt" data once; returns nltk.download's result."""
    return nltk.download("punkt")


@st.cache_resource
def _load_summarizer():
    """Return the Hugging Face summarization pipeline."""
    return pipeline("summarization", model=SUMMARIZER_MODEL)


@st.cache_resource
def _load_qa_pipeline():
    """Return the Hugging Face question-answering pipeline (default model)."""
    return pipeline("question-answering")


nlp = _load_spacy_model()
_download_nltk_data()
summarizer = _load_summarizer()
qa_pipeline = _load_qa_pipeline()


# --- Text extraction --------------------------------------------------------


def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        pdf_file: binary file-like object; its full content is read.
    """
    # The context manager closes the document once the text has been read.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as pdf:
        return "\n".join(page.get_text("text") for page in pdf)


def extract_text_from_docx(docx_file):
    """Return the text of every paragraph of a DOCX, joined with newlines.

    Args:
        docx_file: path or file-like object accepted by ``docx.Document``.
    """
    document = docx.Document(docx_file)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)


# --- NLP helpers ------------------------------------------------------------


def summarize_text(text):
    """Return a summary of ``text`` (50 to 200 tokens, deterministic decoding).

    Input longer than the model's maximum length is truncated rather than
    raising, so only the leading part of a long document is summarized.
    """
    result = summarizer(
        text,
        max_length=200,
        min_length=50,
        do_sample=False,
        truncation=True,
    )
    return result[0]["summary_text"]


def answer_question(text, question):
    """Return the answer span the QA model extracts from ``text``."""
    return qa_pipeline(question=question, context=text)["answer"]


def extract_entities(text):
    """Return ``(entity_text, entity_label)`` pairs found by spaCy."""
    return [(ent.text, ent.label_) for ent in nlp(text).ents]


def generate_mind_map(text):
    """Draw the named entities of ``text`` as graph nodes in the Streamlit page.

    Each distinct entity becomes one node whose size is 100 times its
    occurrence count. No edges are added. When no entity is found an
    informational message is shown instead of an empty plot.
    """
    entity_counts = Counter(ent.text for ent in nlp(text).ents)
    if not entity_counts:
        st.info("No named entities were found in the document.")
        return

    graph = nx.Graph()
    for entity, count in entity_counts.items():
        graph.add_node(entity, size=count * 100)

    # Draw on an explicit figure rather than pyplot's global state, and close
    # it afterwards so figures do not accumulate across reruns.
    fig, ax = plt.subplots(figsize=(10, 7))
    try:
        nx.draw(
            graph,
            nx.spring_layout(graph),
            ax=ax,
            with_labels=True,
            node_size=[graph.nodes[node]["size"] for node in graph.nodes],
            node_color="skyblue",
        )
        ax.set_title("Mind Map of Entities")
        st.pyplot(fig)
    finally:
        plt.close(fig)


# --- Streamlit UI -----------------------------------------------------------

st.title("📜 Legal Document Summarization, NER & Mind Map System")
st.markdown(
    "Upload a legal document, get a summary, extract \n"
    "entities, and generate a mind map!"
)

uploaded_file = st.file_uploader("Upload a PDF or DOCX", type=["pdf", "docx"])

if uploaded_file:
    if uploaded_file.type == PDF_MIME:
        document_text = extract_text_from_pdf(uploaded_file)
    elif uploaded_file.type == DOCX_MIME:
        document_text = extract_text_from_docx(uploaded_file)
    else:
        st.error("Unsupported file format!")
        st.stop()

    # A scanned or empty document yields no text; the models below cannot
    # work with empty input, so stop here with an explanation.
    if not document_text.strip():
        st.warning("No text could be extracted from this document.")
        st.stop()

    st.subheader("Extracted Text Preview")
    st.text_area("Document Content", document_text[:2000], height=250)

    # Summarization
    if st.button("Summarize Document"):
        summary = summarize_text(document_text)
        st.subheader("📌 Summary")
        st.success(summary)

    # Question answering
    user_question = st.text_input("Ask a question about the document:")
    if user_question:
        answer = answer_question(document_text, user_question)
        st.subheader("📝 Answer")
        st.info(answer)

    # Named entity recognition
    if st.button("Extract Entities"):
        entities = extract_entities(document_text)
        st.subheader("📌 Named Entities")
        for entity, label in entities:
            st.write(f"**{entity}** - {label}")

    # Mind map generation
    if st.button("Generate Mind Map"):
        st.subheader("🧠 Mind Map of Entities")
        generate_mind_map(document_text)

st.markdown("---")
st.caption("🚀 Built with Hugging Face, spaCy, and Streamlit")