Spaces:
Sleeping
Sleeping
File size: 6,441 Bytes
8d17c17 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 | import os
import json
import zipfile
import json_repair
from docxtpl import DocxTemplate
from openai import OpenAI
from datetime import datetime
import chromadb
from sentence_transformers import SentenceTransformer
# --- LLM endpoint -----------------------------------------------------------
# The key is read from the environment; it is None when the variable is unset.
API_KEY = os.environ.get("DEEPSEEK_API_KEY")
BASE_URL = "https://api.deepseek.com"

# --- Document templates and generated output --------------------------------
TEMPLATES_DIR = "tagged_templates"
DOWNLOADS_DIR = "downloads"

# --- JSON configuration files -----------------------------------------------
REGISTRY_FILE = "templates_registry.json"
TAGS_DB_FILE = "tags_db.json"

# --- Persistent ChromaDB storage --------------------------------------------
DB_PATH = "./legal_db"
# System prompt for the template router. ``{docs_list}`` is substituted with
# str.format; the doubled braces render as literal JSON braces.
_ROUTER_PROMPT = """
You are a Legal Document Dispatcher. Your goal is to identify the most suitable document template from the list below based on the user's request.
AVAILABLE TEMPLATES:
{docs_list}
INSTRUCTION:
Return ONLY a JSON object: {{"filename": "exact_name.docx"}}
If no suitable template is found, return: {{"filename": null}}
"""

# System prompt for entity extraction. ``{schema}`` is substituted with
# str.format.
_NER_EXTRACTOR_PROMPT = """
You are a Legal Data Extraction specialist. Your task is to extract entity information from the user's query into a structured JSON format.
DATE FORMAT: dd.mm.yyyy
REQUIRED SCHEMA:
{schema}
"""

# System prompt for the legal consultant. ``{context}`` is substituted with
# str.format.
_CONSULTANT_PROMPT = """
You are LexGuard AI, a professional legal assistant specializing in EU Law and GDPR.
Provide accurate, structured, and formal legal advice based on the provided context.
GUIDELINES:
1. CITATIONS: Always mention specific GDPR Articles or Recitals if they are present in the context.
2. LIMITATIONS: If the context doesn't contain the answer, use your general knowledge of EU Law but clearly state it is general information.
3. STRUCTURE: Use Markdown (bolding, bullet points) for clarity.
4. TONE: Professional, objective, and helpful.
GDPR DATABASE CONTEXT:
{context}
"""

# Prompt templates keyed by the role that uses them.
PROMPTS = {
    "router": _ROUTER_PROMPT,
    "ner_extractor": _NER_EXTRACTOR_PROMPT,
    "consultant": _CONSULTANT_PROMPT,
}
# Shared chat-completions client pointed at BASE_URL.
client = OpenAI(api_key=API_KEY, base_url=BASE_URL)

# Retrieval handles default to None so that a failed initialisation below
# leaves the module importable; consult_logic checks both before using them.
encoder = None
collection = None
try:
    encoder = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
    chroma_client = chromadb.PersistentClient(path=DB_PATH)
    collection = chroma_client.get_collection(name="laws")
    print("✅ ChromaDB and Encoder initialized")
except Exception as exc:
    # Best effort: report the failure and continue without retrieval.
    print(f"⚠️ RAG initialization error: {exc}")
# Load the template registry and the tag database. If either file cannot be
# read or parsed, both structures fall back to empty containers.
try:
    with open(REGISTRY_FILE, encoding="utf-8") as registry_fh:
        registry = json.load(registry_fh)
    with open(TAGS_DB_FILE, encoding="utf-8") as tags_fh:
        tags_db = json.load(tags_fh)
    # Keys starting with "_" are excluded from the working tag set.
    clean_tags_db = {
        name: spec
        for name, spec in tags_db.items()
        if not name.startswith("_")
    }
except Exception as exc:
    print(f"⚠️ Config files loading error: {exc}")
    registry = []
    clean_tags_db = {}
async def select_best_template(user_query):
    """Ask the LLM router which registered template best fits the request.

    The synchronous chat-completions call is run in the default executor so
    the event loop is not blocked while waiting for the model.

    Args:
        user_query: Free-text request from the user.

    Returns:
        A filename that is present in ``registry``, or ``None`` when the
        registry is empty, the model reports no match, the model names a
        template that is not registered, or any step fails (the error is
        printed, not raised).
    """
    import asyncio
    import functools

    # Nothing to choose from: skip the API call entirely.
    if not registry:
        return None

    try:
        docs_list = "\n".join(
            f"- {item['filename']} ({item.get('description', '')})"
            for item in registry
        )
        known_filenames = {item["filename"] for item in registry}

        request = functools.partial(
            client.chat.completions.create,
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": PROMPTS["router"].format(docs_list=docs_list)},
                {"role": "user", "content": user_query},
            ],
            response_format={"type": "json_object"},
            temperature=0.0,
        )
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(None, request)

        result = json_repair.loads(response.choices[0].message.content)
        # The repaired payload may be any JSON value, not necessarily an object.
        filename = result.get("filename") if isinstance(result, dict) else None
        if isinstance(filename, str):
            filename = filename.strip()
        if not filename:
            return None

        # The name comes from model output and is later joined into a
        # filesystem path, so only accept names the registry actually lists.
        if filename not in known_filenames:
            print(f"⚠️ Router Error: model returned unknown template {filename!r}")
            return None
        return filename
    except Exception as e:
        print(f"⚠️ Router Error: {e}")
        return None
async def extract_data_from_chat(user_query, filename):
    """Extract document field values from the user's request via the LLM.

    The schema sent to the model is built from every entry in
    ``clean_tags_db``. The synchronous chat-completions call is run in the
    default executor so the event loop is not blocked.

    Args:
        user_query: Free-text request from the user.
        filename: Name of the selected template. Currently unused; the schema
            is not filtered per template.

    Returns:
        A dict of extracted values. An empty dict is returned when the call
        fails or the model's reply is not a JSON object (the problem is
        printed, not raised).
    """
    import asyncio
    import functools

    try:
        schema = "\n".join(
            f"- {spec['tag']}: {spec['description']}"
            for spec in clean_tags_db.values()
        )
        request = functools.partial(
            client.chat.completions.create,
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": PROMPTS["ner_extractor"].format(schema=schema)},
                {"role": "user", "content": user_query},
            ],
            response_format={"type": "json_object"},
            temperature=0.1,
        )
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(None, request)
        extracted = json_repair.loads(response.choices[0].message.content)
    except Exception as e:
        print(f"⚠️ Extraction Error: {e}")
        return {}

    # Callers treat the result as a mapping; a list or string would break them.
    if not isinstance(extracted, dict):
        print(f"⚠️ Extraction Error: expected a JSON object, got {type(extracted).__name__}")
        return {}
    return extracted
async def consult_logic(user_text):
    """Answer a legal question, using retrieved context when available.

    When both ``collection`` and ``encoder`` were initialised, the question is
    embedded and up to three documents are fetched from the vector store to
    fill the consultant prompt. Retrieval failures are printed and the default
    "nothing found" context is used instead. All blocking work (embedding,
    vector query, chat completion) runs in the default executor so the event
    loop is not blocked.

    Args:
        user_text: The user's question.

    Returns:
        ``{"type": "text", "content": ...}`` holding either the model's answer
        or a connection-error message.
    """
    import asyncio
    import functools

    loop = asyncio.get_running_loop()
    context = "No specific articles found in the database."

    # RAG: retrieve context from ChromaDB. Compare against None explicitly so
    # the check does not depend on how these objects define truthiness.
    if collection is not None and encoder is not None:

        def _retrieve():
            vec = encoder.encode(user_text).tolist()
            return collection.query(query_embeddings=[vec], n_results=3)

        try:
            res = await loop.run_in_executor(None, _retrieve)
            if res['documents'] and res['documents'][0]:
                context = "\n---\n".join(res['documents'][0])
        except Exception as e:
            print(f"⚠️ Vector Search Error: {e}")

    try:
        request = functools.partial(
            client.chat.completions.create,
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": PROMPTS["consultant"].format(context=context)},
                {"role": "user", "content": f"User Question: {user_text}"},
            ],
            temperature=0.3,
        )
        response = await loop.run_in_executor(None, request)
        return {"type": "text", "content": response.choices[0].message.content}
    except Exception as e:
        return {"type": "text", "content": f"⚠️ Connection Error: {str(e)}"}
async def generate_doc_logic(user_text):
    """Handles the document generation pipeline (Currently in development).

    Steps: pick a template, extract field values from the request, render the
    template and save the result under ``DOWNLOADS_DIR``.

    Args:
        user_text: The user's free-text request.

    Returns:
        On success, ``{"type": "file", "content": ..., "file_url": <path>}``.
        Otherwise ``{"type": "text", "content": ...}`` carrying a fallback
        draft (no template matched) or an error message.
    """
    best_filename = await select_best_template(user_text)
    if not best_filename:
        fallback = await consult_logic(f"Draft a response for: {user_text}")
        fallback["content"] = "⚠️ **No matching template found.** Here is a manual draft:\n\n" + fallback["content"]
        return fallback

    not_found = {"type": "text", "content": f"⚠️ Template file '{best_filename}' not found on server."}

    # The filename originates from model output. Accept only a bare file name
    # so it cannot steer the template path or the output path outside their
    # directories (e.g. "../x.docx" or an absolute path).
    if not isinstance(best_filename, str) or os.path.basename(best_filename) != best_filename:
        return not_found

    template_path = os.path.join(TEMPLATES_DIR, best_filename)
    # isfile rather than exists: a directory is not a usable template.
    if not os.path.isfile(template_path):
        return not_found

    data = await extract_data_from_chat(user_text, best_filename)
    if not isinstance(data, dict):
        data = {}
    # Default to today's date when the field is missing or empty/null.
    if not data.get("doc_date"):
        data["doc_date"] = datetime.now().strftime("%d.%m.%Y")

    try:
        doc = DocxTemplate(template_path)
        doc.render(data)
        os.makedirs(DOWNLOADS_DIR, exist_ok=True)
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_name = f"LexGuard_{ts}_{best_filename}"
        out_path = os.path.join(DOWNLOADS_DIR, out_name)
        doc.save(out_path)
        return {
            "type": "file",
            "content": f"✅ Document successfully generated using template: **{best_filename}**",
            "file_url": out_path
        }
    except Exception as e:
        return {"type": "text", "content": f"⚠️ Generation error: {e}"}