| import streamlit as st |
| import PyPDF2 |
| from docx import Document |
| import json |
| from google import genai |
| from dotenv import load_dotenv |
| import os |
| import re |
| import pandas as pd |
|
|
| |
# Load variables from a local .env file (if one exists) into the process
# environment, then read the Gemini key from there.
load_dotenv()
api_key = os.environ.get("GEMINI_API_KEY")

# Nothing further down can work without a key: report it and halt this
# script run instead of failing later inside the API call.
if not api_key:
    st.error("β Gemini API key not found. Please set GEMINI_API_KEY.")
    st.stop()
|
|
| |
def extract_text_from_pdf(file):
    """Return the concatenated text of all pages in a PDF.

    ``file`` is passed straight to ``PyPDF2.PdfReader``. Pages whose
    ``extract_text()`` result is falsy (``None`` or an empty string) are
    skipped; the remaining page texts are separated by newlines and the
    final string has leading/trailing whitespace stripped.
    """
    page_texts = (page.extract_text() for page in PyPDF2.PdfReader(file).pages)
    return "\n".join(chunk for chunk in page_texts if chunk).strip()
|
|
| |
def extract_text_from_docx(file):
    """Return the text of a DOCX file, one paragraph per line.

    ``file`` is passed straight to ``docx.Document``. The ``text`` of each
    entry in ``doc.paragraphs`` is joined with newlines and the result is
    stripped of leading/trailing whitespace.
    """
    document = Document(file)
    joined = "\n".join(paragraph.text for paragraph in document.paragraphs)
    return joined.strip()
|
|
| |
# Matches the first Markdown code fence (optionally tagged "json") and
# captures what is inside it.
_FENCED_BLOCK = re.compile(r"```(?:json)?\s*(.*?)\s*```", re.DOTALL | re.IGNORECASE)


def safe_parse_json(response_text):
    """Parse a model reply into a dict, tolerating fences and stray prose.

    Several candidate substrings are tried in order and the first one that
    decodes to a JSON *object* wins:

    1. the whole reply (already clean JSON),
    2. the contents of the first ``` fenced block,
    3. everything from the first ``{`` to the last ``}``.

    Args:
        response_text: Raw text returned by the model; may be ``None``.

    Returns:
        The decoded dict on success. If no candidate decodes to a dict, an
        error is shown through ``st.error`` and a fallback dict is returned
        with the raw reply under ``"summary"`` and ``None`` under
        ``"highlights"`` and ``"glossary"``.
    """
    raw = (response_text or "").strip()

    candidates = [raw]
    fenced = _FENCED_BLOCK.search(raw)
    if fenced:
        candidates.append(fenced.group(1))
    first_brace, last_brace = raw.find("{"), raw.rfind("}")
    if 0 <= first_brace < last_brace:
        candidates.append(raw[first_brace:last_brace + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Callers use ``.get`` on the result, so a bare list/string/number is
        # not an acceptable parse even though it is valid JSON.
        if isinstance(parsed, dict):
            return parsed

    st.error("β οΈ Could not parse Gemini response as JSON. Showing raw response.")
    return {
        "summary": response_text,
        "highlights": None,
        "glossary": None,
    }
|
|
| |
def call_gemini_api(document_text):
    """Send the document to Gemini and return the parsed analysis.

    Builds a single prompt containing the document text followed by the
    instructions, calls the ``gemini-2.0-flash`` model through a
    ``genai.Client`` created with the module-level ``api_key``, and passes
    the reply text to ``safe_parse_json``.

    Args:
        document_text: Plain text of the document to analyse.

    Returns:
        Whatever ``safe_parse_json`` returns for the model's reply.
    """
    prompt = (
        f"Analyze the following legal document:\n\n{document_text}\n\n"
        "Instructions:\n"
        "- Summarize the key points of the document.\n"
        "- Highlight obligations, rights, and critical clauses (as a list of objects with 'clause' and 'description').\n"
        "- Provide simplified explanations of complex legal terms (as a dictionary).\n"
        "Return the result as JSON with keys: 'summary', 'highlights', 'glossary'."
    )

    client = genai.Client(api_key=api_key)
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=prompt,
        # Ask for JSON output explicitly instead of relying on the prompt
        # alone; free-form replies often wrap the JSON in prose or fences.
        config={"response_mime_type": "application/json"},
    )

    return safe_parse_json(response.text)
|
|
| |
def render_highlights(highlights):
    """Render the ``highlights`` part of the analysis.

    Handles the shapes the model reply may take:

    * empty or missing value: an informational "No highlights" notice
      (an empty list previously produced an empty table);
    * string: rendered as Markdown;
    * list made up entirely of dicts: rendered as a table;
    * any other list (e.g. plain strings): rendered as a Markdown bullet
      list instead of being discarded;
    * anything else: the "No highlights" notice.
    """
    if not highlights:
        st.info("No highlights available.")
        return

    if isinstance(highlights, str):
        st.markdown(highlights)
        return

    if isinstance(highlights, list):
        if all(isinstance(item, dict) for item in highlights):
            st.table(pd.DataFrame(highlights))
        else:
            st.markdown("\n".join(f"- {item}" for item in highlights))
        return

    st.info("No highlights available.")
|
|
| |
def render_glossary(glossary):
    """Render the ``glossary`` part of the analysis.

    Handles the shapes the model reply may take:

    * empty or missing value: an informational "No glossary" notice
      (an empty dict previously produced an empty table);
    * dict: a two-column Term / Explanation table;
    * string: rendered as Markdown;
    * list made up entirely of dicts: rendered as a table with the dicts'
      own keys as columns, instead of being discarded;
    * anything else: the "No glossary" notice.
    """
    if not glossary:
        st.info("No glossary available.")
        return

    if isinstance(glossary, dict):
        rows = [
            {"Term": term, "Explanation": explanation}
            for term, explanation in glossary.items()
        ]
        st.table(pd.DataFrame(rows))
        return

    if isinstance(glossary, str):
        st.markdown(glossary)
        return

    if isinstance(glossary, list) and all(isinstance(entry, dict) for entry in glossary):
        st.table(pd.DataFrame(glossary))
        return

    st.info("No glossary available.")
|
|
| |
def main():
    """Run the Streamlit page: upload, preview, then summarise on demand.

    Flow: configure the page, wait for a PDF/DOCX upload, extract its text
    with the extractor matching the upload's MIME type, show the text in a
    preview box, and, once the "Summarize Document" button is pressed, call
    Gemini and render the summary, highlights and glossary sections.
    """
    st.set_page_config(page_title="Legal Document Summarizer", layout="wide")
    st.title("π Legal Document Summarizer")
    st.caption("Upload a legal document (PDF or DOCX) to get a summary, key highlights, and glossary of legal terms.")

    uploaded_file = st.file_uploader("Upload your document", type=["pdf", "docx"])
    if not uploaded_file:
        return

    # Pick the extractor from the MIME type reported for the upload.
    extractors = {
        "application/pdf": extract_text_from_pdf,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": extract_text_from_docx,
    }
    extractor = extractors.get(uploaded_file.type)
    if extractor is None:
        st.error("Unsupported file format.")
        return

    document_text = extractor(uploaded_file)
    if not document_text.strip():
        st.error("No text extracted from the document.")
        return

    st.subheader("π Document Preview")
    st.text_area("Extracted Text", document_text, height=300)

    if not st.button("Summarize Document"):
        return

    with st.spinner("Calling Gemini..."):
        result = call_gemini_api(document_text)

    st.subheader("π Summary")
    summary = result.get("summary", "No summary found.")
    st.write(summary)

    st.subheader("π Highlights")
    highlights = result.get("highlights")
    render_highlights(highlights)

    st.subheader("π Glossary")
    glossary = result.get("glossary")
    render_glossary(glossary)
|
|
# Launch the app only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|
|