Spaces:
Running
Running
File size: 4,697 Bytes
87296cd ec18d9b 66c7ada 1848973 66c7ada bd6f8a7 cfe2de2 bd6f8a7 cfe2de2 bd6f8a7 87296cd bd6f8a7 87296cd bd6f8a7 87296cd 66c7ada 87296cd 66c7ada ec18d9b 87296cd 66c7ada 87296cd 66c7ada 1848973 87296cd 66c7ada 87296cd 66c7ada bd6f8a7 66c7ada 87296cd 66c7ada 87296cd | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
import os
from typing import TypedDict, List
from pydantic import BaseModel, Field
# --- Core LangChain & Document Processing Imports ---
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from core_utils.core_model_loaders import load_embedding_model, load_gemini_llm
from langgraph.graph import StateGraph, END
# --- Initialize Models ---
# Loaded once at import time so every request in this process reuses the same
# embedding model and LLM client instead of re-initializing per call.
embedding_model = load_embedding_model()
llm = load_gemini_llm()
# --- 1. RAG Chain Logic ---
def create_rag_chain(retriever):
    """Assemble a retrieval-augmented chain for Q&A over the loaded document.

    Args:
        retriever: A LangChain retriever that yields relevant document chunks
            for a given question string.

    Returns:
        A runnable chain: question string in, plain-text answer out. The LLM
        is instructed to answer only from the retrieved context.
    """
    qa_template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
    qa_prompt = PromptTemplate.from_template(qa_template)

    def _join_chunks(docs):
        # Concatenate retrieved chunks into one context string for the prompt.
        return "\n\n".join(d.page_content for d in docs)

    # Map the incoming question into both prompt slots: retrieved context and
    # the question itself (passed through unchanged).
    input_map = {
        "context": retriever | _join_chunks,
        "question": RunnablePassthrough(),
    }
    return input_map | qa_prompt | llm | StrOutputParser()
# --- 2. Demystifier Graph Logic ---
class DemystifierState(TypedDict):
    """Shared state passed between nodes of the demystifier LangGraph workflow."""
    document_chunks: List[str]  # raw text of each split PDF chunk (graph input)
    summary: str  # plain-language summary, written by summarize_document
    key_terms: str  # explained legal terms, written by extract_key_terms
    final_report: str  # assembled markdown report, written by generate_report
def summarize_document(state: DemystifierState):
    """Graph node: produce a plain-language summary of the document.

    Reads ``document_chunks`` from the state and returns a partial state
    update containing ``summary``.
    """
    print("---NODE: Summarizing Document---")
    # Join all chunks, then cap at 30k characters so the prompt stays within
    # the model's context limits for very large documents.
    joined_text = "\n\n".join(state["document_chunks"])[:30000]
    summary_prompt = f"""
You are a legal expert. Summarize the following legal document content in simple, easy-to-understand language.
Focus on the main purpose and parties involved.
Content:
{joined_text}
"""
    return {"summary": llm.invoke(summary_prompt).content}
def extract_key_terms(state: DemystifierState):
    """Graph node: identify and explain complex legal terms from the document.

    Reads ``document_chunks`` from the state and returns a partial state
    update containing ``key_terms``.
    """
    print("---NODE: Extracting Key Terms---")
    # Same truncation strategy as summarize_document: 30k chars max.
    joined_text = "\n\n".join(state["document_chunks"])[:30000]
    terms_prompt = f"""
Identify 5-7 complex legal terms or clauses from the text below.
List them and explain what they mean in plain English for a layperson.
Content:
{joined_text}
"""
    return {"key_terms": llm.invoke(terms_prompt).content}
def generate_report(state: DemystifierState):
    """Graph node: assemble the final markdown report from earlier node outputs.

    Reads ``summary`` and ``key_terms`` from the state and returns a partial
    state update containing ``final_report``.
    """
    print("---NODE: Generating Final Report---")
    summary_section = state['summary']
    terms_section = state['key_terms']
    # Fixed markdown skeleton with the two analysis sections slotted in,
    # closing with a standing disclaimer.
    assembled = f"""
# Document Analysis
## 📝 Summary
{summary_section}
## 🔑 Key Terms & Definitions
{terms_section}
## 💡 Expert Advice
Always consult with a qualified lawyer for critical legal decisions. This analysis is AI-generated guidance.
"""
    return {"final_report": assembled}
# --- Build the Graph ---
workflow = StateGraph(DemystifierState)
workflow.add_node("summarize", summarize_document)
workflow.add_node("extract_terms", extract_key_terms)
workflow.add_node("compile_report", generate_report)
# Linear pipeline: summarize -> extract_terms -> compile_report.
# NOTE(review): the original comment claimed "parallel execution", but these
# edges run the nodes strictly in sequence; true fan-out would require two
# edges from a common entry point instead of a chain.
workflow.set_entry_point("summarize")
workflow.add_edge("summarize", "extract_terms")
workflow.add_edge("extract_terms", "compile_report")
workflow.add_edge("compile_report", END)
# Compiled once at import time; invoked per-document by the controller below.
demystifier_agent_graph = workflow.compile()
# --- 4. The Master "Controller" Function ---
def process_document_for_demystification(file_path: str):
    """Run the full pipeline on a PDF: analysis report plus a Q&A RAG chain.

    Args:
        file_path: Path to the PDF file to process.

    Returns:
        dict with keys ``report`` (the markdown analysis produced by the
        graph) and ``rag_chain`` (a runnable chain for follow-up questions).

    Raises:
        ValueError: If the loader finds no content in the PDF.
    """
    print(f"--- Processing document: {file_path} ---")
    pages = PyPDFLoader(file_path).load()
    if not pages:
        raise ValueError("No content found in PDF.")
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100
    ).split_documents(pages)
    print("--- Creating FAISS vector store for Q&A ---")
    vector_store = FAISS.from_documents(chunks, embedding=embedding_model)
    qa_chain = create_rag_chain(vector_store.as_retriever(search_kwargs={"k": 3}))
    print("--- Running analysis graph for the report ---")
    # All chunk texts are handed to the graph; each node truncates its own
    # prompt input, so no additional limiting happens here.
    graph_result = demystifier_agent_graph.invoke(
        {"document_chunks": [c.page_content for c in chunks]}
    )
    return {"report": graph_result.get("final_report"), "rag_chain": qa_chain}