KevanSoon committed on
Commit
e9aff27
·
1 Parent(s): 4082001

added rahul tools.py

Browse files
Files changed (1) hide show
  1. tools/tools.py +87 -48
tools/tools.py CHANGED
@@ -6,6 +6,7 @@ import logging
6
  import textwrap
7
  import asyncio
8
  import re
 
9
 
10
  import langextract as lx
11
  from bs4 import BeautifulSoup
@@ -44,7 +45,7 @@ async def _pre_clean_text_with_gemini(messy_text: str) -> str:
44
  """
45
  Takes messy OCR text and uses Gemini to clean it into a coherent document.
46
  """
47
- model = genai.GenerativeModel(model_name="gemini-1.5-flash-latest")
48
  prompt = textwrap.dedent(
49
  f"""
50
  The following text is from a messy OCR process. It contains extra spaces, incorrect line breaks, and jumbled words.
@@ -66,17 +67,56 @@ async def _pre_clean_text_with_gemini(messy_text: str) -> str:
66
  return messy_text
67
 
68
 
69
- async def _generate_html_summary(extracted_data: dict, language_code: str) -> str:
70
  """
71
- Takes the structured data and generates a clean, user-friendly HTML summary sheet.
72
  """
73
- model = genai.GenerativeModel(model_name="gemini-1.5-flash-latest")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  prompt_data = json.dumps(extracted_data, indent=2, ensure_ascii=False)
75
  prompt = textwrap.dedent(
76
  f"""
77
- You are a web designer creating a one-page summary sheet for a migrant worker.
78
  Your task is to convert the following JSON data into a simple, clean, and easy-to-read HTML document.
79
- The entire document MUST be in the language corresponding to the code: '{language_code}'.
80
 
81
  **JSON Data:**
82
  ```json
@@ -85,10 +125,10 @@ async def _generate_html_summary(extracted_data: dict, language_code: str) -> st
85
 
86
  **Instructions:**
87
  1. Use a single HTML file structure. Include modern, clean CSS in a `<style>` tag.
88
- 2. Create a main container and use a card-based layout. Each key piece of information should be in its own styled `div`.
89
- 3. Use clear headings (e.g., `<h2>`, `<h3>`) for each section, in the target language.
90
  4. Display the `summary` for each clause prominently.
91
- 5. The final output must ONLY be the raw HTML code. Do not add comments or markdown backticks.
92
  """
93
  )
94
  try:
@@ -104,8 +144,7 @@ async def _generate_html_summary(extracted_data: dict, language_code: str) -> st
104
 
105
  async def analyze_contract(html_content: str) -> dict:
106
  """
107
- Analyzes a contract by pre-cleaning the text, extracting structured data,
108
- and then generating a clean HTML summary sheet.
109
  """
110
  messy_document_text = extract_text_from_html(html_content)
111
  if not messy_document_text.strip():
@@ -113,14 +152,19 @@ async def analyze_contract(html_content: str) -> dict:
113
  "error": "Could not extract any meaningful text from the provided HTML content."
114
  }
115
 
116
- logger.info("Stage 1: Pre-cleaning raw OCR text...")
117
  cleaned_document_text = await _pre_clean_text_with_gemini(messy_document_text)
118
  logger.info("Stage 1: Pre-cleaning complete.")
119
 
 
 
 
 
 
 
120
  prompt = textwrap.dedent(
121
  """
122
- You are an expert in labor laws. From the provided text, extract the following entities.
123
- - `document_meta`: Extract the first word and add a 'language_code' attribute (e.g., 'en', 'zh', 'ms').
124
  - `employer`: The name of the employer.
125
  - `employee`: The name of the employee.
126
  - `pay_period`: The date range for the payment.
@@ -128,83 +172,78 @@ async def analyze_contract(html_content: str) -> dict:
128
  - `deductions`: Any deductions from the pay.
129
  - `bonus`: Any bonus payments.
130
 
131
- For each entity, add a `summary` attribute written in the **detected language**, explaining it in simple terms.
132
  """
133
  )
134
  examples = [
135
  lx.data.ExampleData(
136
- text="明细的 付款 名称 雇主 ABC PTE 有限公司 用于 时期: 2021年9月1日 - 2021年9月30日 名称 员工 Kow 基础 支付 2000美元 阿内 奖金 2000美元",
137
  extractions=[
138
- lx.data.Extraction(
139
- extraction_class="document_meta",
140
- extraction_text="明细的",
141
- attributes={"language_code": "zh"},
142
- ),
143
  lx.data.Extraction(
144
  extraction_class="employer",
145
- extraction_text="ABC PTE 有限公司",
146
- attributes={"summary": "雇主是 ABC PTE 有限公司。"},
147
  ),
148
  lx.data.Extraction(
149
  extraction_class="employee",
150
- extraction_text=" Kow",
151
- attributes={"summary": "员工姓名是 Kow"},
152
  ),
153
  lx.data.Extraction(
154
  extraction_class="pay_period",
155
- extraction_text="2021年9月1 - 2021年9月30日",
156
- attributes={"summary": "支付周期为2021年9月1日至30日。"},
 
 
157
  ),
158
  lx.data.Extraction(
159
  extraction_class="salary",
160
- extraction_text="基础 支付 2000美元",
161
- attributes={"summary": "基本工资是 2000美元。"},
162
  ),
163
  lx.data.Extraction(
164
  extraction_class="bonus",
165
- extraction_text="阿内 奖金 2000美元",
166
- attributes={"summary": "奖金是 2000美元。"},
167
  ),
168
  ],
169
  )
170
  ]
171
 
172
  try:
173
- logger.info("Stage 2: Starting structured data extraction from cleaned text...")
174
  annotated_document = await asyncio.to_thread(
175
  lx.extract,
176
- text_or_documents=cleaned_document_text,
177
  prompt_description=prompt,
178
  examples=examples,
179
- model_id="gemini-1.5-flash-latest",
180
  )
181
- logger.info("Stage 2: Extraction complete.")
182
 
183
- language = "unknown"
184
  extracted_data = {}
185
  debug_visualization_html = lx.visualize(annotated_document)
186
 
187
  for extr in annotated_document.extractions:
188
- if extr.extraction_class == "document_meta":
189
- # --- THIS IS THE FIX ---
190
- # Add a safety check to ensure attributes is not None before accessing it.
191
- if extr.attributes:
192
- language = extr.attributes.get("language_code", "unknown")
193
- else:
194
- if extr.attributes: # Also add a check here for safety
195
- extracted_data[extr.extraction_class] = {
196
  "text": extr.extraction_text,
197
  "summary": extr.attributes.get(
198
  "summary", "No summary provided."
199
  ),
200
  }
 
201
 
202
- logger.info("Stage 3: Generating final HTML summary sheet...")
203
- summary_sheet_html = await _generate_html_summary(extracted_data, language)
204
- logger.info("Stage 3: HTML summary sheet generated.")
205
 
206
  return {
207
- "language": language,
208
  "extracted_data": extracted_data,
209
  "summary_sheet_html": summary_sheet_html,
210
  "debug_visualization_html": debug_visualization_html,
 
6
  import textwrap
7
  import asyncio
8
  import re
9
+ import httpx
10
 
11
  import langextract as lx
12
  from bs4 import BeautifulSoup
 
45
  """
46
  Takes messy OCR text and uses Gemini to clean it into a coherent document.
47
  """
48
+ model = genai.GenerativeModel(model_name="gemini-2.5-flash")
49
  prompt = textwrap.dedent(
50
  f"""
51
  The following text is from a messy OCR process. It contains extra spaces, incorrect line breaks, and jumbled words.
 
67
  return messy_text
68
 
69
 
70
async def _translate_text_to_english_with_sealion(text: str) -> str:
    """
    Translates the given text to English using the Sea-Lion model.

    This is a best-effort step: whenever the translation cannot be obtained
    (missing API key, transport failure, non-2xx HTTP status, malformed or
    empty response) the original ``text`` is returned unchanged so the
    caller can keep going with the untranslated document.

    Args:
        text: The text to translate.

    Returns:
        The translated text with one leading and one trailing double quote
        stripped if present, or the original ``text`` on any failure.
    """
    url = "https://api.sea-lion.ai/v1/chat/completions"
    api_key = os.getenv("SEALION_API_KEY")

    if not api_key:
        logger.warning("SEALION_API_KEY not found. Skipping translation.")
        return text

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    prompt = f'Translate the following text to English. Return ONLY the translated text, without any additional explanations, formatting, or quotation marks:\n\n"{text}"'
    payload = {
        "max_completion_tokens": 4096,
        "messages": [{"role": "user", "content": prompt}],
        "model": "aisingapore/Gemma-SEA-LION-v3-9B-IT",
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                url, headers=headers, json=payload, timeout=60.0
            )
            response.raise_for_status()
            response_json = response.json()
            translated_text = response_json["choices"][0]["message"]["content"].strip()
        except httpx.HTTPError as e:
            # httpx.HTTPError is the common base of RequestError (transport
            # problems, timeouts) and HTTPStatusError (raised by
            # raise_for_status() on 4xx/5xx). Catching only RequestError
            # would let HTTP error statuses propagate out of this helper.
            logger.error("Translation request to Sea-Lion failed: %s", e)
            return text
        except (KeyError, IndexError, TypeError, AttributeError, ValueError) as e:
            # ValueError covers a non-JSON body from response.json();
            # TypeError/AttributeError cover an unexpected payload shape
            # or a null "content" field.
            logger.error("Could not parse Sea-Lion translation response: %s", e)
            return text

    # The prompt wraps the input in double quotes, so the model may echo
    # them back; drop one quote from each end if present.
    unquoted_text = re.sub(r'^"|"$', "", translated_text)
    if not unquoted_text.strip():
        # An empty completion carries no information; keep the original text
        # rather than handing an empty document to the extraction stage.
        logger.warning("Sea-Lion returned an empty translation. Using original text.")
        return text
    return unquoted_text
107
+
108
+
109
+ async def _generate_html_summary(extracted_data: dict) -> str:
110
+ """
111
+ Takes the structured data and generates a clean, user-friendly HTML summary sheet in English.
112
+ """
113
+ model = genai.GenerativeModel(model_name="gemini-2.5-flash")
114
  prompt_data = json.dumps(extracted_data, indent=2, ensure_ascii=False)
115
  prompt = textwrap.dedent(
116
  f"""
117
+ You are a web designer creating a one-page summary sheet.
118
  Your task is to convert the following JSON data into a simple, clean, and easy-to-read HTML document.
119
+ The entire document MUST be in English.
120
 
121
  **JSON Data:**
122
  ```json
 
125
 
126
  **Instructions:**
127
  1. Use a single HTML file structure. Include modern, clean CSS in a `<style>` tag.
128
+ 2. Create a main container and use a card-based layout.
129
+ 3. Use clear headings (e.g., `<h2>`, `<h3>`) for each section.
130
  4. Display the `summary` for each clause prominently.
131
+ 5. The final output must ONLY be the raw HTML code.
132
  """
133
  )
134
  try:
 
144
 
145
  async def analyze_contract(html_content: str) -> dict:
146
  """
147
+ Analyzes a contract by cleaning, translating, extracting data, and generating a summary.
 
148
  """
149
  messy_document_text = extract_text_from_html(html_content)
150
  if not messy_document_text.strip():
 
152
  "error": "Could not extract any meaningful text from the provided HTML content."
153
  }
154
 
155
+ logger.info("Stage 1: Pre-cleaning raw text...")
156
  cleaned_document_text = await _pre_clean_text_with_gemini(messy_document_text)
157
  logger.info("Stage 1: Pre-cleaning complete.")
158
 
159
+ logger.info("Stage 2: Translating text to English with Sea-Lion...")
160
+ english_document_text = await _translate_text_to_english_with_sealion(
161
+ cleaned_document_text
162
+ )
163
+ logger.info("Stage 2: Translation complete.")
164
+
165
  prompt = textwrap.dedent(
166
  """
167
+ You are an expert in labor laws. From the provided English text, extract the following entities.
 
168
  - `employer`: The name of the employer.
169
  - `employee`: The name of the employee.
170
  - `pay_period`: The date range for the payment.
 
172
  - `deductions`: Any deductions from the pay.
173
  - `bonus`: Any bonus payments.
174
 
175
+ For each entity, add a `summary` attribute written in simple English.
176
  """
177
  )
178
  examples = [
179
  lx.data.ExampleData(
180
+ text="Payslip for the period: September 1, 2021 - September 30, 2021. Employer's Name: ABC PTE LTD. Employee's Name: Tan Ah Kow. Basic Pay: $2000. Annual Bonus: $2000.",
181
  extractions=[
 
 
 
 
 
182
  lx.data.Extraction(
183
  extraction_class="employer",
184
+ extraction_text="ABC PTE LTD",
185
+ attributes={"summary": "The employer is ABC PTE LTD."},
186
  ),
187
  lx.data.Extraction(
188
  extraction_class="employee",
189
+ extraction_text="Tan Ah Kow",
190
+ attributes={"summary": "The employee's name is Tan Ah Kow."},
191
  ),
192
  lx.data.Extraction(
193
  extraction_class="pay_period",
194
+ extraction_text="September 1, 2021 - September 30, 2021",
195
+ attributes={
196
+ "summary": "The pay period is from September 1, 2021 to September 30, 2021."
197
+ },
198
  ),
199
  lx.data.Extraction(
200
  extraction_class="salary",
201
+ extraction_text="Basic Pay: $2000",
202
+ attributes={"summary": "The base salary is $2000."},
203
  ),
204
  lx.data.Extraction(
205
  extraction_class="bonus",
206
+ extraction_text="Annual Bonus: $2000",
207
+ attributes={"summary": "The annual bonus is $2000."},
208
  ),
209
  ],
210
  )
211
  ]
212
 
213
  try:
214
+ logger.info("Stage 3: Starting structured data extraction from English text...")
215
  annotated_document = await asyncio.to_thread(
216
  lx.extract,
217
+ text_or_documents=english_document_text,
218
  prompt_description=prompt,
219
  examples=examples,
220
+ model_id="gemini-2.5-flash",
221
  )
222
+ logger.info("Stage 3: Extraction complete.")
223
 
 
224
  extracted_data = {}
225
  debug_visualization_html = lx.visualize(annotated_document)
226
 
227
  for extr in annotated_document.extractions:
228
+ if extr.attributes:
229
+ class_key = extr.extraction_class.replace(" ", "_")
230
+ if class_key not in extracted_data:
231
+ extracted_data[class_key] = []
232
+ extracted_data[class_key].append(
233
+ {
 
 
234
  "text": extr.extraction_text,
235
  "summary": extr.attributes.get(
236
  "summary", "No summary provided."
237
  ),
238
  }
239
+ )
240
 
241
+ logger.info("Stage 4: Generating final HTML summary sheet...")
242
+ summary_sheet_html = await _generate_html_summary(extracted_data)
243
+ logger.info("Stage 4: HTML summary sheet generated.")
244
 
245
  return {
246
+ "language": "en",
247
  "extracted_data": extracted_data,
248
  "summary_sheet_html": summary_sheet_html,
249
  "debug_visualization_html": debug_visualization_html,