File size: 14,710 Bytes
f147852
 
 
 
 
83704ca
f147852
83704ca
e9aff27
f147852
83704ca
f147852
 
83704ca
f147852
83704ca
f147852
 
 
 
 
 
 
 
83704ca
f147852
83704ca
 
f147852
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83704ca
f147852
83704ca
f147852
e9aff27
83704ca
 
 
 
 
 
 
 
f147852
83704ca
f147852
83704ca
 
 
f147852
83704ca
f147852
83704ca
 
f147852
 
e9aff27
83704ca
e9aff27
83704ca
e9aff27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83704ca
 
 
e9aff27
83704ca
e9aff27
83704ca
 
 
 
 
 
 
 
e9aff27
 
83704ca
e9aff27
83704ca
 
f147852
83704ca
 
 
 
 
f147852
83704ca
 
f147852
 
83704ca
f147852
e9aff27
f147852
83704ca
 
 
 
 
f147852
e9aff27
83704ca
 
f147852
e9aff27
 
 
 
 
 
d9b934b
83704ca
f147852
d9b934b
 
 
 
 
 
 
 
 
 
 
83704ca
d9b934b
 
 
 
83704ca
d9b934b
 
 
f147852
83704ca
 
d9b934b
83704ca
d9b934b
83704ca
d9b934b
 
 
 
 
 
 
83704ca
 
e9aff27
 
83704ca
 
 
e9aff27
 
83704ca
d9b934b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83704ca
 
d9b934b
e9aff27
d9b934b
e9aff27
83704ca
d9b934b
 
 
 
 
83704ca
 
d9b934b
 
83704ca
 
 
d9b934b
 
 
 
 
 
 
 
 
83704ca
 
d9b934b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83704ca
d9b934b
f147852
83704ca
e9aff27
83704ca
 
e9aff27
83704ca
 
e9aff27
83704ca
e9aff27
83704ca
 
 
 
 
e9aff27
 
 
 
 
 
83704ca
 
 
 
 
e9aff27
83704ca
e9aff27
 
 
f147852
 
e9aff27
83704ca
 
 
f147852
 
83704ca
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
# ./tools/tools.py

import os
import json
import logging
import textwrap
import asyncio
import re
import httpx

import langextract as lx
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import google.generativeai as genai

# Step 1: Load environment variables and configure API keys
# load_dotenv() runs before the os.getenv() lookups in this module so that
# values supplied through a .env file are visible to them.
load_dotenv()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configure the Gemini client once, at import time. The same key is also
# exported as LANGEXTRACT_API_KEY (presumably the variable langextract reads;
# confirm against its documentation).
# A missing key is deliberately non-fatal here: the ValueError raised below is
# caught straight away and downgraded to a warning, so the module still
# imports. Note that any ValueError raised by genai.configure() itself is
# swallowed by the same handler.
try:
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise ValueError("GEMINI_API_KEY not found in environment variables.")
    os.environ["LANGEXTRACT_API_KEY"] = api_key
    genai.configure(api_key=api_key)
except ValueError as e:
    logger.warning(f"API not configured. Tool will fail. Reason: {e}")


def extract_text_from_html(html_content: str) -> str:
    """
    Returns the text content of an HTML string, or "" when the input is empty.

    <script> and <style> elements are dropped before the text is collected;
    the remaining text fragments are stripped and joined with single spaces.
    """
    if html_content:
        document = BeautifulSoup(html_content, "html.parser")
        # Remove non-content elements so their source code is not returned as text.
        for unwanted_tag in document(["script", "style"]):
            unwanted_tag.decompose()
        return document.get_text(separator=" ", strip=True)
    return ""


async def _pre_clean_text_with_gemini(messy_text: str) -> str:
    """
    Takes messy OCR text and uses Gemini to clean it into a coherent document.

    Args:
        messy_text: Raw text to be reformatted.

    Returns:
        The model's cleaned text with surrounding whitespace stripped. The
        original ``messy_text`` is returned unchanged when the model call
        fails or when the model replies with an empty string, so callers
        always get usable text back.
    """
    model = genai.GenerativeModel(model_name="gemini-2.5-flash")
    # Dedent the template BEFORE inserting the document. textwrap.dedent only
    # removes whitespace common to every line, so interpolating multi-line
    # input first (any line starting at column 0) would turn the dedent into
    # a no-op and leave the instructions indented.
    prompt_template = textwrap.dedent(
        """
        The following text is from a messy OCR process. It contains extra spaces, incorrect line breaks, and jumbled words.
        Your task is to clean and reformat it into a single, coherent block of text that reads like a proper document.
        Do not summarize or change the content. Just fix the formatting and structure.
        Return ONLY the cleaned text, with no explanations.

        **Messy Text:**
        ---
        {messy_text}
        ---
    """
    )
    prompt = prompt_template.format(messy_text=messy_text)
    try:
        response = await model.generate_content_async(prompt)
        cleaned_text = response.text.strip()
    except Exception as e:
        # Best-effort stage: any failure falls back to the uncleaned input.
        logger.error(f"Error during text pre-cleaning: {e}")
        return messy_text

    if not cleaned_text:
        # An empty reply would silently wipe the document for later stages.
        logger.warning("Pre-cleaning returned empty text. Using original text.")
        return messy_text
    return cleaned_text


async def _translate_text_to_english_with_sealion(text: str) -> str:
    """
    Translates the given text to English using the Sea-Lion model.

    Translation is best-effort: whenever it cannot be completed the input is
    returned unchanged instead of raising.

    Args:
        text: The text to translate.

    Returns:
        The translated text with one leading and/or trailing double quote
        removed, or the original ``text`` when SEALION_API_KEY is unset, the
        HTTP request fails (transport error or non-2xx status), the response
        cannot be parsed, or the translation comes back empty.
    """
    url = "https://api.sea-lion.ai/v1/chat/completions"
    api_key = os.getenv("SEALION_API_KEY")

    if not api_key:
        logger.warning("SEALION_API_KEY not found. Skipping translation.")
        return text

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    prompt = f'Translate the following text to English. Return ONLY the translated text, without any additional explanations, formatting, or quotation marks:\n\n"{text}"'
    payload = {
        "max_completion_tokens": 4096,
        "messages": [{"role": "user", "content": prompt}],
        "model": "aisingapore/Gemma-SEA-LION-v3-9B-IT",
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                url, headers=headers, json=payload, timeout=60.0
            )
            response.raise_for_status()
            response_json = response.json()
            translated_text = response_json["choices"][0]["message"]["content"].strip()
        except httpx.HTTPError as e:
            # httpx.HTTPError is the common base of RequestError (transport
            # failures) and HTTPStatusError (raised by raise_for_status() on
            # 4xx/5xx). Catching only RequestError let error statuses escape.
            logger.error(f"Translation request to Sea-Lion failed: {e}")
            return text
        except (KeyError, IndexError, TypeError, AttributeError, ValueError) as e:
            # ValueError covers a non-JSON body; TypeError/AttributeError
            # cover null or mis-shaped fields in an otherwise valid payload.
            logger.error(f"Could not parse Sea-Lion translation response: {e}")
            return text

    # The prompt wraps the input in quotes, so the model may echo them back.
    translated_text = re.sub(r'^"|"$', "", translated_text)
    if not translated_text:
        logger.warning("Sea-Lion returned an empty translation. Using original text.")
        return text
    return translated_text


async def _generate_html_summary(extracted_data: dict) -> str:
    """
    Takes the structured data and generates a clean, user-friendly HTML summary sheet in English.

    Args:
        extracted_data: JSON-serialisable mapping of extracted entities; it is
            embedded verbatim (pretty-printed) in the model prompt.

    Returns:
        The HTML produced by the model. If the reply contains a Markdown code
        fence, only the fenced content is returned; otherwise the stripped
        reply is returned as-is. On any failure a minimal HTML error page is
        returned, with the exception message HTML-escaped.
    """
    from html import escape  # stdlib; used only for the error page below

    model = genai.GenerativeModel(model_name="gemini-2.5-flash")
    prompt_data = json.dumps(extracted_data, indent=2, ensure_ascii=False)
    # Dedent the template BEFORE inserting the JSON. Pretty-printed JSON ends
    # with "}" at column 0, so interpolating first would leave no common
    # indentation for textwrap.dedent to strip and the instructions would be
    # sent with their source-code indentation.
    prompt_template = textwrap.dedent(
        """
        You are a web designer creating a one-page summary sheet.
        Your task is to convert the following JSON data into a simple, clean, and easy-to-read HTML document.
        The entire document MUST be in English.

        **JSON Data:**
        ```json
        {prompt_data}
        ```

        **Instructions:**
        1.  Use a single HTML file structure. Include modern, clean CSS in a `<style>` tag.
        2.  Create a main container and use a card-based layout.
        3.  Use clear headings (e.g., `<h2>`, `<h3>`) for each section.
        4.  Display the `summary` for each clause prominently.
        5.  The final output must ONLY be the raw HTML code.
    """
    )
    prompt = prompt_template.format(prompt_data=prompt_data)
    try:
        response = await model.generate_content_async(prompt)
        raw_reply = response.text
        # Accept "```html", "```HTML" or a bare "```" fence, with LF or CRLF
        # line endings; the capture is stripped, so stray whitespace before
        # the closing fence is harmless.
        html_match = re.search(
            r"```(?:html)?[ \t]*\r?\n(.*?)```",
            raw_reply,
            re.DOTALL | re.IGNORECASE,
        )
        if html_match:
            return html_match.group(1).strip()
        return raw_reply.strip()
    except Exception as e:
        logger.error(f"Error generating HTML summary: {e}")
        # Escape the message: exception text is not trusted markup.
        return f"<html><body><h1>Error</h1><p>Could not generate the final summary sheet.</p><p>{escape(str(e))}</p></body></html>"


async def analyze_contract(html_content: str) -> dict:
    """
    Analyzes a contract by cleaning, translating, extracting data, and generating a summary.

    Pipeline:
        1. Extract text from the HTML and pre-clean it with Gemini.
        2. Translate the cleaned text to English with Sea-Lion.
        3. Run langextract over the English text (in a worker thread, because
           ``lx.extract`` is invoked as a blocking call).
        4. Render the grouped extractions as an HTML summary sheet.

    Args:
        html_content: HTML markup of the document to analyze.

    Returns:
        On success, a dict with the keys ``language`` (always ``"en"``),
        ``extracted_data`` (extraction class -> list of
        ``{"text", "summary"}`` dicts), ``summary_sheet_html`` and
        ``debug_visualization_html``.
        A dict with a single ``error`` key is returned when no text can be
        extracted from the HTML, or when stage 3 or 4 raises. Exceptions
        raised by stages 1-2 are not caught here.
    """
    messy_document_text = extract_text_from_html(html_content)
    if not messy_document_text.strip():
        return {
            "error": "Could not extract any meaningful text from the provided HTML content."
        }

    logger.info("Stage 1: Pre-cleaning raw text...")
    cleaned_document_text = await _pre_clean_text_with_gemini(messy_document_text)
    logger.info("Stage 1: Pre-cleaning complete.")

    logger.info("Stage 2: Translating text to English with Sea-Lion...")
    english_document_text = await _translate_text_to_english_with_sealion(
        cleaned_document_text
    )
    logger.info("Stage 2: Translation complete.")

    # --- START: IMPROVED PROMPT AND EXAMPLES ---
    prompt = textwrap.dedent(
        """
        You are a meticulous data extraction system specializing in payslips and employment contracts.
        Your task is to extract specific entities from the provided English text. Follow these rules precisely:

        **Extraction Rules:**
        1.  **Extract Exact Text:** The `extraction_text` must be the exact text from the document representing the entity's value, without including the label (e.g., for "Basic Pay: $2000", extract "$2000", not the whole phrase).
        2.  **Do Not Overlap:** Entities must not overlap.
        3.  **Be Comprehensive:** Extract all occurrences of each entity type. For example, if there are multiple bonuses or deductions, extract each one as a separate entity.
        4.  **No Inference:** If an entity is not explicitly mentioned, do not extract anything for it. Do not invent information.

        **Entities to Extract:**
        - `employer`: The name of the company or employer.
        - `employee`: The name of the employee.
        - `pay_period`: The specific date range for the payslip (e.g., "September 1, 2021 to September 30, 2021").
        - `salary`: The primary or base salary amount.
        - `deductions`: Any amount subtracted from the pay.
        - `bonus`: Any additional payments like bonuses, allowances, or overtime pay.

        **Attribute Generation:**
        - For every extraction, you MUST generate a `summary` attribute.
        - The summary should be a complete, simple English sentence describing the extracted entity. For example: "The employer is ABC PTE LTD." or "The base salary is $2000."
        """
    )
    examples = [
        # Example 1: Clean, standard key-value format
        lx.data.ExampleData(
            text="Payslip for September 1, 2021 - September 30, 2021. Company: ABC PTE LTD. Staff: Tan Ah Kow. Basic Pay: $2000. Annual Bonus: $2000.",
            extractions=[
                lx.data.Extraction(
                    extraction_class="pay_period",
                    extraction_text="September 1, 2021 - September 30, 2021",
                    attributes={
                        "summary": "The pay period is from September 1, 2021 to September 30, 2021."
                    },
                ),
                lx.data.Extraction(
                    extraction_class="employer",
                    extraction_text="ABC PTE LTD",
                    attributes={"summary": "The employer is ABC PTE LTD."},
                ),
                lx.data.Extraction(
                    extraction_class="employee",
                    extraction_text="Tan Ah Kow",
                    attributes={"summary": "The employee's name is Tan Ah Kow."},
                ),
                lx.data.Extraction(
                    extraction_class="salary",
                    extraction_text="$2000",
                    attributes={"summary": "The base salary is $2000."},
                ),
                lx.data.Extraction(
                    extraction_class="bonus",
                    extraction_text="$2000",
                    attributes={"summary": "An annual bonus of $2000 was paid."},
                ),
            ],
        ),
        # Example 2: Messier, tabular-style text without clear key-value pairs
        lx.data.ExampleData(
            text="Employer Name ABC Global Services Period of Pay 01/10/2022 to 31/10/2022 Employee John Doe Earnings Base Salary 3,500.00 Transport Allowance 150.00 Deductions CPF Contribution 700.00",
            extractions=[
                lx.data.Extraction(
                    extraction_class="employer",
                    extraction_text="ABC Global Services",
                    attributes={"summary": "The employer is ABC Global Services."},
                ),
                lx.data.Extraction(
                    extraction_class="pay_period",
                    extraction_text="01/10/2022 to 31/10/2022",
                    attributes={
                        "summary": "The pay period is from 01/10/2022 to 31/10/2022."
                    },
                ),
                lx.data.Extraction(
                    extraction_class="employee",
                    extraction_text="John Doe",
                    attributes={"summary": "The employee's name is John Doe."},
                ),
                lx.data.Extraction(
                    extraction_class="salary",
                    extraction_text="3,500.00",
                    attributes={"summary": "The base salary is 3,500.00."},
                ),
                lx.data.Extraction(
                    extraction_class="bonus",
                    extraction_text="150.00",
                    attributes={
                        "summary": "A transport allowance of 150.00 was provided."
                    },
                ),
                lx.data.Extraction(
                    extraction_class="deductions",
                    extraction_text="700.00",
                    attributes={"summary": "A CPF deduction of 700.00 was made."},
                ),
            ],
        ),
        # Example 3: Multiple entries for one class, and a missing class
        lx.data.ExampleData(
            text="Payslip for Jane Smith at Innovate Corp. For the month of November 2023. Salary: 4000 SGD. Deductions include a loan payment of 200 and a charity donation of 50. No bonus was issued.",
            extractions=[
                lx.data.Extraction(
                    extraction_class="employee",
                    extraction_text="Jane Smith",
                    attributes={"summary": "The employee's name is Jane Smith."},
                ),
                lx.data.Extraction(
                    extraction_class="employer",
                    extraction_text="Innovate Corp",
                    attributes={"summary": "The employer is Innovate Corp."},
                ),
                lx.data.Extraction(
                    extraction_class="pay_period",
                    extraction_text="November 2023",
                    attributes={
                        "summary": "The pay period is for the month of November 2023."
                    },
                ),
                lx.data.Extraction(
                    extraction_class="salary",
                    extraction_text="4000 SGD",
                    attributes={"summary": "The salary is 4000 SGD."},
                ),
                lx.data.Extraction(
                    extraction_class="deductions",
                    extraction_text="200",
                    attributes={"summary": "A loan payment deduction of 200 was made."},
                ),
                lx.data.Extraction(
                    extraction_class="deductions",
                    extraction_text="50",
                    attributes={
                        "summary": "A charity donation deduction of 50 was made."
                    },
                ),
            ],
        ),
    ]
    # --- END: IMPROVED PROMPT AND EXAMPLES ---

    try:
        logger.info("Stage 3: Starting structured data extraction from English text...")
        # lx.extract is called synchronously, so run it in a worker thread to
        # keep the event loop responsive.
        annotated_document = await asyncio.to_thread(
            lx.extract,
            text_or_documents=english_document_text,
            prompt_description=prompt,
            examples=examples,
            model_id="gemini-2.5-flash",
        )
        logger.info("Stage 3: Extraction complete.")

        extracted_data = {}
        visualization = lx.visualize(annotated_document)
        # Per the LangExtract README, visualize() can return a notebook HTML
        # wrapper whose markup lives in `.data`; unwrap it so the result is
        # always plain markup. A plain str has no `.data` and passes through.
        debug_visualization_html = getattr(visualization, "data", visualization)

        # `extractions` may be None when nothing was found; treat that as an
        # empty result rather than letting the loop raise TypeError.
        for extr in annotated_document.extractions or []:
            # Extractions without attributes are skipped, as before.
            if not extr.attributes:
                continue
            class_key = extr.extraction_class.replace(" ", "_")
            extracted_data.setdefault(class_key, []).append(
                {
                    "text": extr.extraction_text,
                    "summary": extr.attributes.get(
                        "summary", "No summary provided."
                    ),
                }
            )

        logger.info("Stage 4: Generating final HTML summary sheet...")
        summary_sheet_html = await _generate_html_summary(extracted_data)
        logger.info("Stage 4: HTML summary sheet generated.")

        return {
            "language": "en",
            "extracted_data": extracted_data,
            "summary_sheet_html": summary_sheet_html,
            "debug_visualization_html": debug_visualization_html,
        }
    except Exception as e:
        # Top-level boundary for stages 3-4: log with traceback and report
        # the failure to the caller as data instead of raising.
        logger.error(f"An error occurred during contract analysis: {e}", exc_info=True)
        return {"error": f"An unexpected error occurred: {str(e)}"}