Spaces:
Sleeping
Sleeping
import os
import subprocess
import sys
from collections import Counter

import docx
import fitz  # PyMuPDF
import matplotlib.pyplot as plt
import networkx as nx
import nltk
import spacy
import streamlit as st
from transformers import pipeline
# Ensure the spaCy model is available, downloading it on first run.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # Use the current interpreter (sys.executable) rather than whatever
    # "python" happens to be on PATH, and fail loudly (check=True) instead
    # of silently ignoring a failed download as os.system() did.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_lg"],
        check=True,
    )
    nlp = spacy.load("en_core_web_lg")

# Load NLP models (weights are fetched/cached by their libraries on first use).
nltk.download("punkt", quiet=True)  # sentence-tokenizer data; quiet avoids log spam
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
qa_pipeline = pipeline("question-answering")
| # Function to extract text from PDF | |
| def extract_text_from_pdf(pdf_file): | |
| doc = fitz.open(stream=pdf_file.read(), filetype="pdf") | |
| text = "\n".join([page.get_text("text") for page in doc]) | |
| return text | |
| # Function to extract text from DOCX | |
| def extract_text_from_docx(docx_file): | |
| doc = docx.Document(docx_file) | |
| text = "\n".join([para.text for para in doc.paragraphs]) | |
| return text | |
| # Summarization function | |
| def summarize_text(text): | |
| return summarizer(text, max_length=200, min_length=50, do_sample=False)[0]["summary_text"] | |
| # Q&A Function | |
| def answer_question(text, question): | |
| return qa_pipeline({"context": text, "question": question})["answer"] | |
| # Named Entity Recognition (NER) | |
| def extract_entities(text): | |
| doc = nlp(text) | |
| entities = [(ent.text, ent.label_) for ent in doc.ents] | |
| return entities | |
| # Generate Mind Map | |
| def generate_mind_map(text): | |
| doc = nlp(text) | |
| entity_counts = Counter([ent.text for ent in doc.ents]) | |
| G = nx.Graph() | |
| for entity, count in entity_counts.items(): | |
| G.add_node(entity, size=count * 100) | |
| pos = nx.spring_layout(G) | |
| plt.figure(figsize=(10, 7)) | |
| nx.draw(G, pos, with_labels=True, node_size=[G.nodes[n]['size'] for n in G.nodes], node_color="skyblue") | |
| plt.title("Mind Map of Entities") | |
| st.pyplot(plt) | |
# Streamlit UI (set_page_config must be the first Streamlit call)
st.set_page_config(page_title="Legal Document Summarizer & Query System", layout="wide")
st.title("π Legal Document Summarization, NER & Mind Map System")
st.markdown("""Upload a legal document, get a summary, extract entities, and generate a mind map!""")

# File uploader
uploaded_file = st.file_uploader("Upload a PDF or DOCX", type=["pdf", "docx"])

if uploaded_file:
    # Dispatch on the browser-reported MIME type of the upload.
    if uploaded_file.type == "application/pdf":
        document_text = extract_text_from_pdf(uploaded_file)
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        document_text = extract_text_from_docx(uploaded_file)
    else:
        st.error("Unsupported file format!")
        st.stop()

    # Guard: scanned/image-only PDFs yield no extractable text, and feeding an
    # empty string to the pipelines below produces confusing failures.
    if not document_text.strip():
        st.error("No extractable text found in this document.")
        st.stop()

    st.subheader("Extracted Text Preview")
    st.text_area("Document Content", document_text[:2000], height=250)

    # Summarization
    if st.button("Summarize Document"):
        summary = summarize_text(document_text)
        st.subheader("π Summary")
        st.success(summary)

    # Question Answering
    user_question = st.text_input("Ask a question about the document:")
    if user_question:
        answer = answer_question(document_text, user_question)
        st.subheader("π Answer")
        st.info(answer)

    # Named Entity Recognition
    if st.button("Extract Entities"):
        entities = extract_entities(document_text)
        st.subheader("π Named Entities")
        if entities:
            for entity, label in entities:
                st.write(f"**{entity}** - {label}")
        else:
            # Explicit feedback instead of silently rendering nothing.
            st.info("No named entities detected.")

    # Mind Map Generation
    if st.button("Generate Mind Map"):
        st.subheader("π§ Mind Map of Entities")
        generate_mind_map(document_text)

st.markdown("---")
st.caption("π Built with Hugging Face, spaCy, and Streamlit")