File size: 14,710 Bytes
f147852 83704ca f147852 83704ca e9aff27 f147852 83704ca f147852 83704ca f147852 83704ca f147852 83704ca f147852 83704ca f147852 83704ca f147852 83704ca f147852 e9aff27 83704ca f147852 83704ca f147852 83704ca f147852 83704ca f147852 83704ca f147852 e9aff27 83704ca e9aff27 83704ca e9aff27 83704ca e9aff27 83704ca e9aff27 83704ca e9aff27 83704ca e9aff27 83704ca f147852 83704ca f147852 83704ca f147852 83704ca f147852 e9aff27 f147852 83704ca f147852 e9aff27 83704ca f147852 e9aff27 d9b934b 83704ca f147852 d9b934b 83704ca d9b934b 83704ca d9b934b f147852 83704ca d9b934b 83704ca d9b934b 83704ca d9b934b 83704ca e9aff27 83704ca e9aff27 83704ca d9b934b 83704ca d9b934b e9aff27 d9b934b e9aff27 83704ca d9b934b 83704ca d9b934b 83704ca d9b934b 83704ca d9b934b 83704ca d9b934b f147852 83704ca e9aff27 83704ca e9aff27 83704ca e9aff27 83704ca e9aff27 83704ca e9aff27 83704ca e9aff27 83704ca e9aff27 f147852 e9aff27 83704ca f147852 83704ca |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 |
# ./tools/tools.py
import os
import json
import logging
import textwrap
import asyncio
import re
import httpx
import langextract as lx
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import google.generativeai as genai
# Step 1: Load environment variables and configure API keys
load_dotenv()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# A missing GEMINI_API_KEY is reported as a warning instead of aborting the
# import, so the module stays importable even when the key is absent.
try:
    api_key = os.getenv("GEMINI_API_KEY")
    if api_key:
        # Expose the same key under LANGEXTRACT_API_KEY (presumably the
        # variable langextract reads - confirm against its documentation)
        # and hand it to the Gemini SDK.
        os.environ["LANGEXTRACT_API_KEY"] = api_key
        genai.configure(api_key=api_key)
    else:
        raise ValueError("GEMINI_API_KEY not found in environment variables.")
except ValueError as e:
    logger.warning(f"API not configured. Tool will fail. Reason: {e}")
def extract_text_from_html(html_content: str) -> str:
    """
    Return the human-readable text of an HTML string.

    ``<script>`` and ``<style>`` elements are removed before the remaining
    text nodes are stripped and joined with single spaces. A falsy input
    (empty string, ``None``) yields an empty string.
    """
    if html_content:
        document = BeautifulSoup(html_content, "html.parser")
        # Calling the soup object with a list of names finds all such tags.
        for non_text_tag in document(["script", "style"]):
            non_text_tag.decompose()
        return document.get_text(separator=" ", strip=True)
    return ""
async def _pre_clean_text_with_gemini(messy_text: str) -> str:
"""
Takes messy OCR text and uses Gemini to clean it into a coherent document.
"""
model = genai.GenerativeModel(model_name="gemini-2.5-flash")
prompt = textwrap.dedent(
f"""
The following text is from a messy OCR process. It contains extra spaces, incorrect line breaks, and jumbled words.
Your task is to clean and reformat it into a single, coherent block of text that reads like a proper document.
Do not summarize or change the content. Just fix the formatting and structure.
Return ONLY the cleaned text, with no explanations.
**Messy Text:**
---
{messy_text}
---
"""
)
try:
response = await model.generate_content_async(prompt)
return response.text.strip()
except Exception as e:
logger.error(f"Error during text pre-cleaning: {e}")
return messy_text
async def _translate_text_to_english_with_sealion(text: str) -> str:
"""
Translates the given text to English using the Sea-Lion model.
"""
url = "https://api.sea-lion.ai/v1/chat/completions"
api_key = os.getenv("SEALION_API_KEY")
if not api_key:
logger.warning("SEALION_API_KEY not found. Skipping translation.")
return text
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
}
prompt = f'Translate the following text to English. Return ONLY the translated text, without any additional explanations, formatting, or quotation marks:\n\n"{text}"'
payload = {
"max_completion_tokens": 4096,
"messages": [{"role": "user", "content": prompt}],
"model": "aisingapore/Gemma-SEA-LION-v3-9B-IT",
}
async with httpx.AsyncClient() as client:
try:
response = await client.post(
url, headers=headers, json=payload, timeout=60.0
)
response.raise_for_status()
response_json = response.json()
translated_text = response_json["choices"][0]["message"]["content"].strip()
return re.sub(r'^"|"$', "", translated_text)
except httpx.RequestError as e:
logger.error(f"Translation request to Sea-Lion failed: {e}")
return text
except (KeyError, IndexError) as e:
logger.error(f"Could not parse Sea-Lion translation response: {e}")
return text
async def _generate_html_summary(extracted_data: dict) -> str:
    """
    Takes the structured data and generates a clean, user-friendly HTML summary sheet in English.

    Args:
        extracted_data: JSON-serializable data; it is dumped with
            ``json.dumps`` and embedded in the prompt sent to Gemini.

    Returns:
        The HTML produced by the model. When the reply wraps the HTML in a
        Markdown code fence, only the fenced content is returned; otherwise
        the stripped reply is returned as-is. If generation raises, a small
        fallback HTML page containing the (escaped) error message is returned.
    """
    # Local import keeps this block self-contained; only needed for the
    # fallback page below.
    from html import escape

    prompt_data = json.dumps(extracted_data, indent=2, ensure_ascii=False)
    # Dedent the template *before* inserting the JSON: the dumped JSON ends
    # with a "}" at column 0, which would otherwise leave dedent with no
    # common indentation to remove.
    prompt_template = textwrap.dedent(
        """
        You are a web designer creating a one-page summary sheet.
        Your task is to convert the following JSON data into a simple, clean, and easy-to-read HTML document.
        The entire document MUST be in English.
        **JSON Data:**
        ```json
        {prompt_data}
        ```
        **Instructions:**
        1. Use a single HTML file structure. Include modern, clean CSS in a `<style>` tag.
        2. Create a main container and use a card-based layout.
        3. Use clear headings (e.g., `<h2>`, `<h3>`) for each section.
        4. Display the `summary` for each clause prominently.
        5. The final output must ONLY be the raw HTML code.
        """
    )
    prompt = prompt_template.format(prompt_data=prompt_data)

    try:
        model = genai.GenerativeModel(model_name="gemini-2.5-flash")
        response = await model.generate_content_async(prompt)
        reply = response.text
        # Accept "```html" or a bare "```" fence, any letter case, and CRLF
        # line endings; the closing fence need not be preceded by a newline.
        html_match = re.search(
            r"```(?:html)?\s*\n(.*?)```", reply, re.DOTALL | re.IGNORECASE
        )
        if html_match:
            return html_match.group(1).strip()
        return reply.strip()
    except Exception as e:
        logger.error(f"Error generating HTML summary: {e}")
        # Escape the message so characters such as "<" or "&" in the error
        # text cannot break or inject markup into the fallback page.
        return f"<html><body><h1>Error</h1><p>Could not generate the final summary sheet.</p><p>{escape(str(e))}</p></body></html>"
def _build_extraction_prompt() -> str:
    """Return the instruction prompt handed to langextract for entity extraction."""
    return textwrap.dedent(
        """
        You are a meticulous data extraction system specializing in payslips and employment contracts.
        Your task is to extract specific entities from the provided English text. Follow these rules precisely:
        **Extraction Rules:**
        1. **Extract Exact Text:** The `extraction_text` must be the exact text from the document representing the entity's value, without including the label (e.g., for "Basic Pay: $2000", extract "$2000", not the whole phrase).
        2. **Do Not Overlap:** Entities must not overlap.
        3. **Be Comprehensive:** Extract all occurrences of each entity type. For example, if there are multiple bonuses or deductions, extract each one as a separate entity.
        4. **No Inference:** If an entity is not explicitly mentioned, do not extract anything for it. Do not invent information.
        **Entities to Extract:**
        - `employer`: The name of the company or employer.
        - `employee`: The name of the employee.
        - `pay_period`: The specific date range for the payslip (e.g., "September 1, 2021 to September 30, 2021").
        - `salary`: The primary or base salary amount.
        - `deductions`: Any amount subtracted from the pay.
        - `bonus`: Any additional payments like bonuses, allowances, or overtime pay.
        **Attribute Generation:**
        - For every extraction, you MUST generate a `summary` attribute.
        - The summary should be a complete, simple English sentence describing the extracted entity. For example: "The employer is ABC PTE LTD." or "The base salary is $2000."
        """
    )


def _build_extraction_examples() -> list:
    """Return the few-shot examples that show langextract the expected output."""
    return [
        # Example 1: Clean, standard key-value format
        lx.data.ExampleData(
            text="Payslip for September 1, 2021 - September 30, 2021. Company: ABC PTE LTD. Staff: Tan Ah Kow. Basic Pay: $2000. Annual Bonus: $2000.",
            extractions=[
                lx.data.Extraction(
                    extraction_class="pay_period",
                    extraction_text="September 1, 2021 - September 30, 2021",
                    attributes={
                        "summary": "The pay period is from September 1, 2021 to September 30, 2021."
                    },
                ),
                lx.data.Extraction(
                    extraction_class="employer",
                    extraction_text="ABC PTE LTD",
                    attributes={"summary": "The employer is ABC PTE LTD."},
                ),
                lx.data.Extraction(
                    extraction_class="employee",
                    extraction_text="Tan Ah Kow",
                    attributes={"summary": "The employee's name is Tan Ah Kow."},
                ),
                lx.data.Extraction(
                    extraction_class="salary",
                    extraction_text="$2000",
                    attributes={"summary": "The base salary is $2000."},
                ),
                lx.data.Extraction(
                    extraction_class="bonus",
                    extraction_text="$2000",
                    attributes={"summary": "An annual bonus of $2000 was paid."},
                ),
            ],
        ),
        # Example 2: Messier, tabular-style text without clear key-value pairs
        lx.data.ExampleData(
            text="Employer Name ABC Global Services Period of Pay 01/10/2022 to 31/10/2022 Employee John Doe Earnings Base Salary 3,500.00 Transport Allowance 150.00 Deductions CPF Contribution 700.00",
            extractions=[
                lx.data.Extraction(
                    extraction_class="employer",
                    extraction_text="ABC Global Services",
                    attributes={"summary": "The employer is ABC Global Services."},
                ),
                lx.data.Extraction(
                    extraction_class="pay_period",
                    extraction_text="01/10/2022 to 31/10/2022",
                    attributes={
                        "summary": "The pay period is from 01/10/2022 to 31/10/2022."
                    },
                ),
                lx.data.Extraction(
                    extraction_class="employee",
                    extraction_text="John Doe",
                    attributes={"summary": "The employee's name is John Doe."},
                ),
                lx.data.Extraction(
                    extraction_class="salary",
                    extraction_text="3,500.00",
                    attributes={"summary": "The base salary is 3,500.00."},
                ),
                lx.data.Extraction(
                    extraction_class="bonus",
                    extraction_text="150.00",
                    attributes={
                        "summary": "A transport allowance of 150.00 was provided."
                    },
                ),
                lx.data.Extraction(
                    extraction_class="deductions",
                    extraction_text="700.00",
                    attributes={"summary": "A CPF deduction of 700.00 was made."},
                ),
            ],
        ),
        # Example 3: Multiple entries for one class, and a missing class
        lx.data.ExampleData(
            text="Payslip for Jane Smith at Innovate Corp. For the month of November 2023. Salary: 4000 SGD. Deductions include a loan payment of 200 and a charity donation of 50. No bonus was issued.",
            extractions=[
                lx.data.Extraction(
                    extraction_class="employee",
                    extraction_text="Jane Smith",
                    attributes={"summary": "The employee's name is Jane Smith."},
                ),
                lx.data.Extraction(
                    extraction_class="employer",
                    extraction_text="Innovate Corp",
                    attributes={"summary": "The employer is Innovate Corp."},
                ),
                lx.data.Extraction(
                    extraction_class="pay_period",
                    extraction_text="November 2023",
                    attributes={
                        "summary": "The pay period is for the month of November 2023."
                    },
                ),
                lx.data.Extraction(
                    extraction_class="salary",
                    extraction_text="4000 SGD",
                    attributes={"summary": "The salary is 4000 SGD."},
                ),
                lx.data.Extraction(
                    extraction_class="deductions",
                    extraction_text="200",
                    attributes={"summary": "A loan payment deduction of 200 was made."},
                ),
                lx.data.Extraction(
                    extraction_class="deductions",
                    extraction_text="50",
                    attributes={
                        "summary": "A charity donation deduction of 50 was made."
                    },
                ),
            ],
        ),
    ]


def _group_extractions_by_class(extractions) -> dict:
    """Group extractions into ``{class_key: [{"text": ..., "summary": ...}, ...]}``."""
    grouped = {}
    # ``extractions`` is treated as optional so a document with no results
    # produces an empty mapping instead of a TypeError.
    for extraction in extractions or []:
        if not extraction.attributes:
            # Extractions without any attributes are skipped, as before.
            continue
        # Spaces in class names are replaced so the keys are identifier-like.
        class_key = extraction.extraction_class.replace(" ", "_")
        grouped.setdefault(class_key, []).append(
            {
                "text": extraction.extraction_text,
                "summary": extraction.attributes.get(
                    "summary", "No summary provided."
                ),
            }
        )
    return grouped


def _visualization_to_html(visualization) -> str:
    """Normalise the result of ``lx.visualize`` to a plain HTML string."""
    # lx.visualize may hand back an IPython ``HTML`` wrapper instead of ``str``
    # (its markup is then on ``.data``, per the langextract README); unwrap it
    # so the result dict only holds plain, JSON-serializable values.
    return str(getattr(visualization, "data", visualization))


async def analyze_contract(html_content: str) -> dict:
    """
    Analyzes a contract by cleaning, translating, extracting data, and generating a summary.

    Pipeline:
        1. Extract text from the HTML and pre-clean it with Gemini.
        2. Translate the cleaned text to English with Sea-Lion.
        3. Run langextract over the English text and group the results.
        4. Ask Gemini for an HTML summary sheet of the grouped data.

    Args:
        html_content: HTML source of the document to analyze.

    Returns:
        On success, a dict with the keys ``language`` (always ``"en"``),
        ``extracted_data``, ``summary_sheet_html`` and
        ``debug_visualization_html``. When no text can be extracted from the
        HTML, or when stage 3 or 4 raises, a dict with the single key
        ``error`` is returned instead.
    """
    messy_document_text = extract_text_from_html(html_content)
    if not messy_document_text.strip():
        return {
            "error": "Could not extract any meaningful text from the provided HTML content."
        }

    logger.info("Stage 1: Pre-cleaning raw text...")
    cleaned_document_text = await _pre_clean_text_with_gemini(messy_document_text)
    logger.info("Stage 1: Pre-cleaning complete.")

    logger.info("Stage 2: Translating text to English with Sea-Lion...")
    english_document_text = await _translate_text_to_english_with_sealion(
        cleaned_document_text
    )
    logger.info("Stage 2: Translation complete.")

    prompt = _build_extraction_prompt()
    examples = _build_extraction_examples()

    try:
        logger.info("Stage 3: Starting structured data extraction from English text...")
        # lx.extract is a blocking call; run it in a worker thread so the
        # event loop stays responsive.
        annotated_document = await asyncio.to_thread(
            lx.extract,
            text_or_documents=english_document_text,
            prompt_description=prompt,
            examples=examples,
            model_id="gemini-2.5-flash",
        )
        logger.info("Stage 3: Extraction complete.")

        debug_visualization_html = _visualization_to_html(
            lx.visualize(annotated_document)
        )
        extracted_data = _group_extractions_by_class(annotated_document.extractions)

        logger.info("Stage 4: Generating final HTML summary sheet...")
        summary_sheet_html = await _generate_html_summary(extracted_data)
        logger.info("Stage 4: HTML summary sheet generated.")

        return {
            "language": "en",
            "extracted_data": extracted_data,
            "summary_sheet_html": summary_sheet_html,
            "debug_visualization_html": debug_visualization_html,
        }
    except Exception as e:
        # Top-level boundary: log with traceback and report the failure to the
        # caller as an error result rather than raising.
        logger.error(f"An error occurred during contract analysis: {e}", exc_info=True)
        return {"error": f"An unexpected error occurred: {str(e)}"}
|