Spaces:
Running
Running
| import os | |
| from typing import TypedDict, List | |
| from pydantic import BaseModel, Field | |
| # --- Core LangChain & Document Processing Imports --- | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_core.prompts import PromptTemplate | |
| from langchain_core.runnables import RunnablePassthrough | |
| from langchain_core.output_parsers import StrOutputParser | |
| from core_utils.core_model_loaders import load_embedding_model, load_gemini_llm | |
| from langgraph.graph import StateGraph, END | |
# --- Initialize Models ---
# Module-level singletons shared by the RAG chain and every graph node below.
embedding_model = load_embedding_model()  # embeddings for the FAISS vector store
llm = load_gemini_llm()  # chat model invoked by the RAG chain and analysis nodes
| # --- 1. RAG Chain Logic --- | |
def create_rag_chain(retriever):
    """Build a retrieval-augmented QA chain over the supplied retriever.

    The chain fetches context documents, fills the prompt template, calls
    the module-level LLM, and parses the reply into a plain string.
    """
    template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
    prompt = PromptTemplate.from_template(template)

    def _join_pages(docs):
        # Concatenate the retrieved documents into one context string.
        return "\n\n".join(d.page_content for d in docs)

    # Map the incoming question into the two prompt variables, then run
    # prompt -> LLM -> string parser.
    inputs = {
        "context": retriever | _join_pages,
        "question": RunnablePassthrough(),
    }
    return inputs | prompt | llm | StrOutputParser()
| # --- 2. Demystifier Graph Logic --- | |
class DemystifierState(TypedDict):
    """Shared state passed between the LangGraph analysis nodes."""
    # Raw text of each document chunk fed into the graph.
    document_chunks: List[str]
    # Plain-language summary written by the summarize node.
    summary: str
    # Explained legal terms written by the extract_terms node.
    key_terms: str
    # Markdown report assembled by the compile_report node.
    final_report: str
def summarize_document(state: DemystifierState):
    """Produce a plain-language summary of the document chunks in *state*."""
    print("---NODE: Summarizing Document---")
    full_text = "\n\n".join(state["document_chunks"])
    # Truncate for safety if too large for prompt
    clipped = full_text[:30000]
    prompt = f"""
You are a legal expert. Summarize the following legal document content in simple, easy-to-understand language.
Focus on the main purpose and parties involved.
Content:
{clipped}
"""
    return {"summary": llm.invoke(prompt).content}
def extract_key_terms(state: DemystifierState):
    """Identify complex legal terms in the document and explain them plainly."""
    print("---NODE: Extracting Key Terms---")
    # Join all chunks, then cap length to stay within the prompt budget.
    joined = "\n\n".join(state["document_chunks"])
    joined = joined[:30000]
    prompt = f"""
Identify 5-7 complex legal terms or clauses from the text below.
List them and explain what they mean in plain English for a layperson.
Content:
{joined}
"""
    answer = llm.invoke(prompt)
    return {"key_terms": answer.content}
def generate_report(state: DemystifierState):
    """Assemble the final markdown report from the summary and key terms."""
    print("---NODE: Generating Final Report---")
    summary = state['summary']
    key_terms = state['key_terms']
    report = f"""
# Document Analysis
## 📝 Summary
{summary}
## 🔑 Key Terms & Definitions
{key_terms}
## 💡 Expert Advice
Always consult with a qualified lawyer for critical legal decisions. This analysis is AI-generated guidance.
"""
    return {"final_report": report}
# --- Build the Graph ---
workflow = StateGraph(DemystifierState)
workflow.add_node("summarize", summarize_document)
workflow.add_node("extract_terms", extract_key_terms)
workflow.add_node("compile_report", generate_report)
# Sequential pipeline: summarize -> extract_terms -> compile_report.
# NOTE(review): an earlier comment claimed these two analysis steps run in
# parallel, but the edges below chain them strictly one after another.
workflow.set_entry_point("summarize")
workflow.add_edge("summarize", "extract_terms")
workflow.add_edge("extract_terms", "compile_report")
workflow.add_edge("compile_report", END)
demystifier_agent_graph = workflow.compile()
| # --- 4. The Master "Controller" Function --- | |
def process_document_for_demystification(file_path: str):
    """Load a PDF, run the full analysis graph, and build a Q&A RAG chain.

    Args:
        file_path: Path to the PDF file to analyse.

    Returns:
        dict with keys:
            "report": markdown analysis report produced by the demystifier graph.
            "rag_chain": a RAG chain for answering questions about the document.

    Raises:
        ValueError: if the PDF yields no loadable content, or splitting
            produces no text chunks.
    """
    print(f"--- Processing document: {file_path} ---")
    loader = PyPDFLoader(file_path)
    documents = loader.load()
    if not documents:
        raise ValueError("No content found in PDF.")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(documents)
    # Guard: FAISS.from_documents raises an opaque error on an empty list
    # (e.g. a PDF whose pages contain only images/whitespace), so fail with
    # a clear message instead.
    if not chunks:
        raise ValueError("Document produced no text chunks to analyze.")
    print("--- Creating FAISS vector store for Q&A ---")
    vectorstore = FAISS.from_documents(chunks, embedding=embedding_model)
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    rag_chain = create_rag_chain(retriever)
    print("--- Running analysis graph for the report ---")
    chunk_contents = [chunk.page_content for chunk in chunks]
    # Limit context to avoid token limits if document is huge
    graph_input = {"document_chunks": chunk_contents}
    result = demystifier_agent_graph.invoke(graph_input)
    report = result.get("final_report")
    return {"report": report, "rag_chain": rag_chain}