arya123321 committed on
Commit
eec757a
·
1 Parent(s): bd1e4b7

initial commit

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. Dockerfile +15 -0
  3. chatbot.py +253 -0
  4. requirements.txt +0 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
Dockerfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use the official Python image
FROM python:3.9

# Create the non-root user that the --chown=user flags below refer to;
# without it the COPY instructions cannot resolve the user name.
RUN useradd -m -u 1000 user
USER user
# pip installs console scripts into the user's local bin directory.
ENV PATH="/home/user/.local/bin:$PATH"

# Set working directory
WORKDIR /app

# Copy dependencies first for caching
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy the rest of the application files
COPY --chown=user . /app

# chatbot:app is a Flask (WSGI) application, so it cannot be served by
# uvicorn's default ASGI interface. Start it through Flask's own CLI instead.
# NOTE(review): for production traffic, add a WSGI server such as gunicorn to
# requirements.txt and launch the app with it.
ENV FLASK_APP=chatbot
CMD ["python", "-m", "flask", "run", "--host", "0.0.0.0", "--port", "7860"]
chatbot.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify
2
+ from llama_index.core import VectorStoreIndex
3
+ from llama_index.core import Settings
4
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
5
+ from llama_index.llms.groq import Groq
6
+ from llama_index.vector_stores.pinecone import PineconeVectorStore
7
+ from pinecone import Pinecone
8
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
9
+ from PyPDF2 import PdfReader
10
+ from flask_cors import CORS
11
+ from functools import wraps
12
+ from dotenv import load_dotenv
13
+ from huggingface_hub import InferenceClient
14
+ import re, torch, jwt, os, json, gc
15
+
16
# Load variables from a local .env file into the process environment.
load_dotenv()

# Key used to verify the HS256 signature of incoming JWTs.
SECRET_KEY = os.getenv("SECRET_KEY")

# Read the Hugging Face token once; it is shared by the inference client and
# by the model/tokenizer downloads below.
_hf_token = os.getenv("HF_API_KEY")  # Add your Hugging Face API key to .env

# Initialize Hugging Face Inference Client for embeddings
client = InferenceClient(
    provider="hf-inference",
    api_key=_hf_token,
)

# Load summarization model and tokenizer
# `token=` is the current keyword; `use_auth_token=` is deprecated.
model_path = "Jurisight/legal_led"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path, token=_hf_token)
tokenizer = AutoTokenizer.from_pretrained(model_path, token=_hf_token)

# Configure LlamaIndex settings
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
Settings.llm = Groq(model="llama3-8b-8192", api_key=os.getenv("GROQ_API_KEY"))

# Initialize Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pinecone_index_chat = "llamaindex"  # index queried by the /chat endpoint
pinecone_index_retrieval = "judgment-search"  # index queried by /retrieve-cases

app = Flask(__name__)
CORS(app)
42
+
43
# Authentication decorator
def authenticate_user(f):
    """Require a valid JWT in the ``x-auth-token`` request header.

    The token is verified with ``SECRET_KEY`` using HS256. On success the
    wrapped view is called with the token's ``id`` claim as its first
    positional argument. On any failure a JSON error body is returned with
    HTTP status 401 and the view is not called.
    """
    @wraps(f)
    def decorated_function(*args, **kwargs):
        token = request.headers.get("x-auth-token")
        if not token:
            return jsonify({"error": "Authentication token is missing"}), 401
        try:
            decoded_token = jwt.decode(token, SECRET_KEY, algorithms=["HS256"])
        except jwt.ExpiredSignatureError:
            return jsonify({"error": "Token has expired"}), 401
        except jwt.InvalidTokenError:
            return jsonify({"error": "Invalid token"}), 401

        # Use .get(): a correctly signed token that lacks an "id" claim must
        # be rejected with 401 instead of raising an uncaught KeyError.
        user_id = decoded_token.get("id")
        if not user_id:
            return jsonify({"error": "Invalid token structure"}), 401
        return f(user_id, *args, **kwargs)
    return decorated_function
61
+
62
# System prompt for the chatbot
# Each tuple entry is one paragraph; paragraphs are joined with a blank line.
_PROMPT_PARAGRAPHS = (
    # Role and scope of the assistant.
    "You are Jurisight, a highly knowledgeable legal chatbot. Your purpose is to assist "
    "users with questions related to legal documents, laws, judgments, and legal topics. "
    "Do not answer questions unrelated to the legal domain. Provide accurate and concise "
    "legal responses based on your training and knowledge.",
    # How uploaded documents and their summaries are to be treated.
    "If the user has uploaded a document, consider only the most recently uploaded document "
    "and its generated summary in your responses. Forget any previous documents or summaries "
    "when a new one is uploaded. If no document has been uploaded, do not assume otherwise.",
    # How earlier turns of the conversation are to be used.
    "Maintain continuity by considering the chat history. If a user follows up on a previous question, "
    "use the past interactions for context rather than responding in isolation. However, "
    "do not reference any document unless one is currently available.",
)
SYSTEM_PROMPT = "\n\n".join(_PROMPT_PARAGRAPHS)

# Global storage for document text, summaries, and chat history
document_text_storage = dict()  # extracted document text, keyed by user id
summarized_content = dict()  # generated summary, keyed by user id
context_text = str()  # module-wide string the chat endpoint appends context to
chat_history = dict()  # list of "User: ..." / "Jurisight: ..." lines, keyed by user id
81
+
82
# Function to extract entities from text
def extract_entities(text):
    """Extract structured case details from a legal document via the Groq LLM.

    Builds a prompt that embeds ``text``, sends it to the ``llama3-8b-8192``
    model, and parses the span from the first ``{`` to the last ``}`` of the
    reply as JSON.

    Returns:
        The parsed JSON value, or an empty dict when that span is not valid
        JSON or an ``AttributeError`` is raised while handling the reply.
    """
    groq_llm = Groq(model="llama3-8b-8192", api_key=os.getenv("GROQ_API_KEY"))

    # One entry per prompt line; the empty first and last entries reproduce
    # the leading and trailing newline of the prompt.
    prompt_lines = (
        '',
        'Read the following legal document and extract structured data in valid JSON format.',
        'If some values are missing, **generate a concise 50-100 word summary** based on the document’s context.',
        '',
        'Ensure the following fields are always present:',
        '- "Client Name": Extract or infer the client\'s full name.',
        '- "Gender": Identify if explicitly mentioned; otherwise, infer based on name.',
        '- "Matter": Identify the case type or legal matter.',
        '- "Client Objectives": Summarize the client\'s main objective.',
        '- "Custody Status": Extract whether the petitioner is in custody (Yes/No).',
        '- "Crime Registered": Indicate whether a crime has been registered (Yes/No).',
        '- "Application Filing": Indicate whether an application has been filed (Yes/No).',
        '- "Legal Analysis.Prayer Details": Summarize the relief sought in 50-100 words.',
        '- "Legal Analysis.Interim Relief Details": Summarize any interim relief in 50-100 words.',
        '- "Legal Analysis.Grounds": Extract or infer legal grounds in 50-100 words.',
        '',
        'Return only a valid JSON object without any extra text.',
        '',
        'Document:',
        f'{text}',
        '',
    )
    prompt = "\n".join(prompt_lines)

    try:
        reply = groq_llm.complete(prompt).text.strip()
        # Keep only the outermost brace-delimited span so that any text the
        # model adds around the JSON object is discarded before parsing.
        first_brace = reply.find("{")
        past_last_brace = reply.rfind("}") + 1
        return json.loads(reply[first_brace:past_last_brace])
    except (json.JSONDecodeError, AttributeError):
        return {}
117
+
118
# Chat endpoint
@app.route('/chat', methods=['POST'])
@authenticate_user
def chat(user_id):
    """Answer a user's question using their document, summary and chat history.

    Expects a JSON object body with a non-empty string ``message``. The query
    sent to the LlamaIndex query engine is made of the system prompt, the
    caller's stored document text and summary (empty when none is stored),
    the last ten history lines, and the new message.

    Returns:
        ``{"response": <answer>}`` with 200 on success,
        ``{"error": ...}`` with 400 for a malformed request, or
        ``{"error": ...}`` with 500 when answering fails.
    """
    # silent=True yields None for a missing or malformed JSON body, so a bad
    # request is reported as 400 instead of surfacing as an exception.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or 'message' not in payload:
        return jsonify({"error": "Invalid request format"}), 400

    user_message = payload['message']
    if not isinstance(user_message, str) or not user_message.strip():
        return jsonify({"error": "Invalid request format"}), 400

    try:
        # All context is looked up per user; nothing is accumulated in
        # module-level state shared between users.
        document_text = document_text_storage.get(user_id, "")
        summary_text = summarized_content.get(user_id, "")
        history = chat_history.setdefault(user_id, [])
        chat_memory = "\n".join(history[-10:])

        formatted_message = (
            f"{SYSTEM_PROMPT}\n\n"
            f"Document Context:\n{document_text}\n\n"
            f"Summarized Content:\n{summary_text}\n\n"
            f"Chat History:\n{chat_memory}\n"
            f"User: {user_message}\n"
            "Jurisight:"
        )

        pinecone_index = pc.Index(pinecone_index_chat)
        vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
        index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
        query_engine = index.as_query_engine()
        answer = str(query_engine.query(formatted_message))

        history.append(f"User: {user_message}")
        history.append(f"Jurisight: {answer}")
        # Only the last ten lines are ever read back, so drop older ones to
        # keep per-user memory bounded.
        del history[:-10]

        return jsonify({"response": answer}), 200
    except Exception:
        # Boundary handler: record the traceback, return a generic error.
        app.logger.exception("Chat request failed")
        return jsonify({"error": "Internal server error"}), 500
147
+
148
# Summarize endpoint
@app.route('/summarize', methods=['POST'])
@authenticate_user
def summarize(user_id):
    """Extract text from an uploaded PDF, store it, and return its summary.

    Expects a multipart upload under the ``file`` field. The extracted text is
    stored for the user, summarized chunk by chunk with the module-level
    ``model``/``tokenizer``, and the summary is stored as well.

    Returns:
        ``{"summary": <text>}`` with 200 on success,
        ``{"error": ...}`` with 400 when no usable file or text is supplied,
        or ``{"error": ...}`` with 500 when reading or summarizing fails.
    """
    def clean_text(text):
        """Collapse every run of whitespace into one space and trim the ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def summarize_legal_document(document_text, chunk_size=1024, max_output_length=128):
        """Summarize ``document_text`` in fixed-size pieces and join the results.

        ``chunk_size`` is both the number of characters per piece and the
        token limit handed to the tokenizer; ``max_output_length`` caps the
        generated length per piece.
        """
        chunks = [
            document_text[start:start + chunk_size]
            for start in range(0, len(document_text), chunk_size)
        ]
        summaries = []
        for chunk in chunks:
            # Each chunk is encoded on its own, so padding to max_length would
            # only add compute; truncation still enforces the token limit.
            inputs = tokenizer(
                chunk,
                max_length=chunk_size,
                truncation=True,
                return_tensors="pt",
            )
            summary_ids = model.generate(
                inputs["input_ids"],
                # Pass the mask explicitly instead of leaving it implicit.
                attention_mask=inputs["attention_mask"],
                num_beams=4,
                max_length=max_output_length,
                early_stopping=True,
            )
            summaries.append(
                tokenizer.decode(summary_ids[0], skip_special_tokens=True).strip()
            )
        return " ".join(summaries)

    if 'file' not in request.files:
        return jsonify({"error": "No file provided"}), 400

    uploaded = request.files['file']
    if uploaded.filename == '':
        return jsonify({"error": "Empty file uploaded"}), 400

    try:
        reader = PdfReader(uploaded)
        # extract_text() may yield nothing for a page; skip those pages.
        page_texts = (page.extract_text() for page in reader.pages)
        document_text = clean_text(" ".join(text for text in page_texts if text))
        if len(document_text.split()) < 10:
            return jsonify({"error": "The document does not contain sufficient readable text."}), 400

        document_text_storage[user_id] = document_text
        # Drop the previous document's summary first so a failed
        # summarization cannot leave it paired with the new document.
        summarized_content.pop(user_id, None)
        summary = summarize_legal_document(document_text)
        summarized_content[user_id] = summary
        return jsonify({"summary": summary}), 200
    except Exception:
        # Boundary handler: record the traceback, return a generic error.
        app.logger.exception("Failed to process uploaded document")
        return jsonify({"error": "Error processing the file"}), 500
205
+
206
# Retrieve cases endpoint
@app.route('/retrieve-cases', methods=['POST'])
@authenticate_user
def retrieve_cases(user_id):
    """Find stored judgments similar to the caller's uploaded document.

    Expects a non-empty JSON object body; ``top_k`` (positive integer,
    default 10) sets how many matches to request. The caller's stored
    document text is embedded and used to query the retrieval index.

    Returns:
        ``{"case_links": [{"score": ..., "url": ...}, ...]}`` with 200,
        ``{"error": "No relevant cases found."}`` with 200 when nothing
        matches, ``{"error": ...}`` with 400 for a bad request or when no
        document is stored, or ``{"error": ...}`` with 500 on failure.
    """
    def generate_embedding(text):
        """Embed ``text`` with the Hugging Face Inference API."""
        # The text is the positional argument of feature_extraction(); the
        # provider is already configured on the module-level client.
        # NOTE(review): the whole document is sent as one input; confirm the
        # endpoint handles inputs longer than the model's limit.
        return client.feature_extraction(text, model="BAAI/bge-base-en-v1.5")

    def query_pinecone(query_text, top_k=10):
        """Return the ``top_k`` nearest matches for ``query_text``."""
        query_embedding = generate_embedding(query_text)
        retrieval_index = pc.Index(pinecone_index_retrieval)
        return retrieval_index.query(
            vector=query_embedding.tolist(),
            top_k=top_k,
            include_metadata=True,
        )

    # silent=True yields None for a missing or malformed JSON body.
    payload = request.get_json(silent=True)
    if not payload or not isinstance(payload, dict):
        return jsonify({"error": "No file or query provided"}), 400

    document_text = document_text_storage.get(user_id)
    if not document_text:
        return jsonify({"error": "No document available for retrieval"}), 400

    top_k = payload.get('top_k', 10)
    # bool is a subclass of int, so it is excluded explicitly.
    if isinstance(top_k, bool) or not isinstance(top_k, int) or top_k < 1:
        return jsonify({"error": "top_k must be a positive integer"}), 400

    try:
        results = query_pinecone(document_text, top_k=top_k)
        if not results['matches']:
            return jsonify({"error": "No relevant cases found."}), 200
        case_links = [
            {"score": match['score'], "url": match['metadata']['url']}
            for match in results['matches']
        ]
        return jsonify({"case_links": case_links}), 200
    except Exception:
        # Boundary handler: record the traceback, return a generic error.
        app.logger.exception("Case retrieval failed")
        return jsonify({"error": "Error retrieving cases"}), 500
241
+
242
# Fetch form data endpoint
@app.route('/fetch-form-data', methods=['GET'])
@authenticate_user
def fetch_form_data(user_id):
    """Return the structured fields extracted from the caller's stored document.

    Responds with 400 and ``{"error": "No document found"}`` when no document
    text is stored for ``user_id``; otherwise with 200 and whatever
    ``extract_entities`` returns for that text.
    """
    try:
        stored_text = document_text_storage[user_id]
    except KeyError:
        return jsonify({"error": "No document found"}), 400
    return jsonify(extract_entities(stored_text)), 200
250
+
251
# Run the app
if __name__ == '__main__':
    # Debug mode enables Werkzeug's interactive debugger, which allows
    # arbitrary code execution. The server listens on all interfaces, so
    # debug stays off unless FLASK_DEBUG is set explicitly.
    debug_enabled = os.getenv("FLASK_DEBUG", "").strip().lower() in ("1", "true", "yes")
    app.run(debug=debug_enabled, host='0.0.0.0', port=7860)
requirements.txt ADDED
Binary file (4.74 kB). View file