omm7 committed on
Commit
8ea81be
·
verified ·
1 Parent(s): b208056

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +20 -23
app.py CHANGED
@@ -7,34 +7,29 @@ import numpy as np
7
  from pathlib import Path
8
  from sentence_transformers import SentenceTransformer
9
  from huggingface_hub import CommitScheduler
10
- from openai import OpenAI
11
  from chromadb.errors import NotFoundError
 
12
 
13
  # Load embedding model
14
  embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
15
 
16
- # Load ChromaDB client and try to get the collection
17
  chroma_client = chromadb.PersistentClient(path="./clause_index")
18
  try:
19
  collection = chroma_client.get_collection("legal_clauses")
20
  except NotFoundError:
21
  collection = None
22
 
23
- # Setup OpenAI client
24
  client = OpenAI(
25
  base_url="https://router.huggingface.co/featherless-ai/v1",
26
  api_key=os.getenv("HF_TOKEN"),
27
  )
28
 
29
  # Prompt template
30
- system_message = """You are a legal AI assistant trained on contract clause examples from the CUAD dataset.
31
- Your task is to answer questions using relevant clauses from contract documents.
32
- If no relevant clause is retrieved, infer the answer using your legal reasoning based on common contractual standards."""
33
-
34
  user_template = """
35
-
36
-
37
-
38
  ### Context:
39
  {context}
40
 
@@ -42,7 +37,7 @@ user_template = """
42
  {question}
43
  """
44
 
45
- # Setup query logging
46
  log_file = Path("logs/") / f"query_{uuid.uuid4()}.json"
47
  log_file.parent.mkdir(exist_ok=True)
48
  scheduler = CommitScheduler(
@@ -53,12 +48,16 @@ scheduler = CommitScheduler(
53
  every=2
54
  )
55
 
56
- # Main predict function
57
  def predict(question):
58
  try:
 
59
  query_embedding = embed_model.encode([question], normalize_embeddings=True)[0]
 
 
60
  context = "No relevant clauses were found in the database. Please answer using your legal understanding from the CUAD dataset."
61
 
 
62
  if collection:
63
  try:
64
  results = collection.query(
@@ -69,21 +68,20 @@ def predict(question):
69
  metadatas = results["metadatas"][0]
70
 
71
  if documents:
72
- context_parts = [
73
  f"[Clause Type: {m['clause_type']}] {doc}"
74
  for doc, m in zip(documents, metadatas)
75
- ]
76
- context = "\n\n".join(context_parts)
77
-
78
- except Exception as e:
79
- # Log internal error, but let LLM proceed with generic context
80
  context = "Due to an internal retrieval issue, please answer based on your legal knowledge from CUAD dataset."
81
 
 
82
  prompt = [
83
  {"role": "system", "content": system_message},
84
  {"role": "user", "content": user_template.format(context=context, question=question)}
85
  ]
86
 
 
87
  stream = client.chat.completions.create(
88
  model="mistralai/Mistral-7B-Instruct-v0.2",
89
  messages=prompt,
@@ -94,13 +92,12 @@ def predict(question):
94
 
95
  output = ""
96
  for chunk in stream:
97
- delta = chunk.choices[0].delta.content or ""
98
- output += delta
99
 
100
  except Exception as e:
101
  output = f"An internal error occurred while generating the response: {str(e)}"
102
 
103
- # Log the interaction
104
  with scheduler.lock:
105
  with log_file.open("a") as f:
106
  f.write(json.dumps({
@@ -111,13 +108,13 @@ def predict(question):
111
 
112
  return output
113
 
114
- # Launch Gradio app
115
  demo = gr.Interface(
116
  fn=predict,
117
  inputs=gr.Textbox(label="Enter your legal question:", lines=4),
118
  outputs=gr.Textbox(label="Answer"),
119
  title="⚖️ GL_LegalMind",
120
- description="Ask legal contract-related questions. Answers are based on ChromaDB if available or inferred using CUAD-based legal knowledge."
121
  )
122
 
123
  demo.queue()
 
7
  from pathlib import Path
8
  from sentence_transformers import SentenceTransformer
9
  from huggingface_hub import CommitScheduler
 
10
  from chromadb.errors import NotFoundError
11
+ from openai import OpenAI
12
 
13
# Embedding model used for both the indexed clauses and incoming queries.
# NOTE(review): must match the model used when the ChromaDB index was built,
# otherwise query/document vectors are incompatible — confirm against indexer.
embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
15
 
16
# Load the persistent ChromaDB index of contract clauses. On a fresh
# deployment the collection may not exist yet; in that case `collection`
# stays None and predict() falls back to the LLM's own legal knowledge.
chroma_client = chromadb.PersistentClient(path="./clause_index")
try:
    collection = chroma_client.get_collection("legal_clauses")
except NotFoundError:
    collection = None
22
 
23
# OpenAI-compatible client routed through Hugging Face's featherless-ai
# inference provider; authenticates with the HF_TOKEN environment variable.
# NOTE(review): os.getenv returns None when HF_TOKEN is unset — requests
# would then fail at call time; consider failing fast here instead.
client = OpenAI(
    base_url="https://router.huggingface.co/featherless-ai/v1",
    api_key=os.getenv("HF_TOKEN"),
)
28
 
29
# System prompt: steer the LLM toward clause-grounded answers, with an
# explicit instruction for the no-retrieval fallback. (Fixed the garbled
# trailing sentence "... standards. and report that no clause retrieved".)
system_message = """You are a legal AI assistant trained on contract clause examples from the CUAD dataset.
If no clauses are retrieved from the database, infer the answer using your understanding of common contractual standards, and state that no clause was retrieved."""
 
 
32
  user_template = """
 
 
 
33
  ### Context:
34
  {context}
35
 
 
37
  {question}
38
  """
39
 
40
# Per-session query log: a uniquely named JSON-lines file under logs/,
# so concurrent app instances never write to the same file. The directory
# is created up front; exist_ok avoids a race with prior runs.
log_file = Path("logs/") / f"query_{uuid.uuid4()}.json"
log_file.parent.mkdir(exist_ok=True)
43
  scheduler = CommitScheduler(
 
48
  every=2
49
  )
50
 
51
+ # Main QA function
52
  def predict(question):
53
  try:
54
+ # Encode query
55
  query_embedding = embed_model.encode([question], normalize_embeddings=True)[0]
56
+
57
+ # Default fallback context
58
  context = "No relevant clauses were found in the database. Please answer using your legal understanding from the CUAD dataset."
59
 
60
+ # If collection exists, try retrieval
61
  if collection:
62
  try:
63
  results = collection.query(
 
68
  metadatas = results["metadatas"][0]
69
 
70
  if documents:
71
+ context = "\n\n".join(
72
  f"[Clause Type: {m['clause_type']}] {doc}"
73
  for doc, m in zip(documents, metadatas)
74
+ )
75
+ except Exception:
 
 
 
76
  context = "Due to an internal retrieval issue, please answer based on your legal knowledge from CUAD dataset."
77
 
78
+ # Construct prompt
79
  prompt = [
80
  {"role": "system", "content": system_message},
81
  {"role": "user", "content": user_template.format(context=context, question=question)}
82
  ]
83
 
84
+ # Generate response
85
  stream = client.chat.completions.create(
86
  model="mistralai/Mistral-7B-Instruct-v0.2",
87
  messages=prompt,
 
92
 
93
  output = ""
94
  for chunk in stream:
95
+ output += chunk.choices[0].delta.content or ""
 
96
 
97
  except Exception as e:
98
  output = f"An internal error occurred while generating the response: {str(e)}"
99
 
100
+ # Log to file
101
  with scheduler.lock:
102
  with log_file.open("a") as f:
103
  f.write(json.dumps({
 
108
 
109
  return output
110
 
111
# Gradio UI: a single text-in/text-out interface over predict().
# queue() enables request queuing so streamed generations from concurrent
# users are processed in order rather than dropped.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label="Enter your legal question:", lines=4),
    outputs=gr.Textbox(label="Answer"),
    title="⚖️ GL_LegalMind",
    description="Ask contract-related legal questions. Answers are based on retrieved clauses or inferred from CUAD knowledge.",
)

demo.queue()