Spaces:

kevansoon
/

backend

Sleeping

App Files Files Community

KevanSoon committed on Aug 14, 2025

Commit

e9aff27

1 Parent(s): 4082001

added rahul tools.py

Browse files

Files changed (1) hide show

tools/tools.py +87 -48

tools/tools.py CHANGED Viewed

@@ -6,6 +6,7 @@ import logging
 import textwrap
 import asyncio
 import re
 import langextract as lx
 from bs4 import BeautifulSoup
@@ -44,7 +45,7 @@ async def _pre_clean_text_with_gemini(messy_text: str) -> str:
     """
     Takes messy OCR text and uses Gemini to clean it into a coherent document.
     """
-    model = genai.GenerativeModel(model_name="gemini-1.5-flash-latest")
     prompt = textwrap.dedent(
         f"""
         The following text is from a messy OCR process. It contains extra spaces, incorrect line breaks, and jumbled words.
@@ -66,17 +67,56 @@ async def _pre_clean_text_with_gemini(messy_text: str) -> str:
         return messy_text
-async def _generate_html_summary(extracted_data: dict, language_code: str) -> str:
     """
-    Takes the structured data and generates a clean, user-friendly HTML summary sheet.
     """
-    model = genai.GenerativeModel(model_name="gemini-1.5-flash-latest")
     prompt_data = json.dumps(extracted_data, indent=2, ensure_ascii=False)
     prompt = textwrap.dedent(
         f"""
-        You are a web designer creating a one-page summary sheet for a migrant worker.
         Your task is to convert the following JSON data into a simple, clean, and easy-to-read HTML document.
-        The entire document MUST be in the language corresponding to the code: '{language_code}'.
         **JSON Data:**
         ```json
@@ -85,10 +125,10 @@ async def _generate_html_summary(extracted_data: dict, language_code: str) -> st
         **Instructions:**
         1.  Use a single HTML file structure. Include modern, clean CSS in a `<style>` tag.
-        2.  Create a main container and use a card-based layout. Each key piece of information should be in its own styled `div`.
-        3.  Use clear headings (e.g., `<h2>`, `<h3>`) for each section, in the target language.
         4.  Display the `summary` for each clause prominently.
-        5.  The final output must ONLY be the raw HTML code. Do not add comments or markdown backticks.
     """
     )
     try:
@@ -104,8 +144,7 @@ async def _generate_html_summary(extracted_data: dict, language_code: str) -> st
 async def analyze_contract(html_content: str) -> dict:
     """
-    Analyzes a contract by pre-cleaning the text, extracting structured data,
-    and then generating a clean HTML summary sheet.
     """
     messy_document_text = extract_text_from_html(html_content)
     if not messy_document_text.strip():
@@ -113,14 +152,19 @@ async def analyze_contract(html_content: str) -> dict:
             "error": "Could not extract any meaningful text from the provided HTML content."
         }
-    logger.info("Stage 1: Pre-cleaning raw OCR text...")
     cleaned_document_text = await _pre_clean_text_with_gemini(messy_document_text)
     logger.info("Stage 1: Pre-cleaning complete.")
     prompt = textwrap.dedent(
         """
-        You are an expert in labor laws. From the provided text, extract the following entities.
-        - `document_meta`: Extract the first word and add a 'language_code' attribute (e.g., 'en', 'zh', 'ms').
         - `employer`: The name of the employer.
         - `employee`: The name of the employee.
         - `pay_period`: The date range for the payment.
@@ -128,83 +172,78 @@ async def analyze_contract(html_content: str) -> dict:
         - `deductions`: Any deductions from the pay.
         - `bonus`: Any bonus payments.
-        For each entity, add a `summary` attribute written in the **detected language**, explaining it in simple terms.
         """
     )
     examples = [
         lx.data.ExampleData(
-            text="明细的 付款 滑 名称 的 雇主 ABC PTE 有限公司 用于 的 时期： 2021年9月1日 - 2021年9月30日 名称 的 员工 吨 啊 Kow 基础 支付 2000美元 阿内 奖金 2000美元",
             extractions=[
-                lx.data.Extraction(
-                    extraction_class="document_meta",
-                    extraction_text="明细的",
-                    attributes={"language_code": "zh"},
-                ),
                 lx.data.Extraction(
                     extraction_class="employer",
-                    extraction_text="ABC PTE 有限公司",
-                    attributes={"summary": "雇主是 ABC PTE 有限公司。"},
                 ),
                 lx.data.Extraction(
                     extraction_class="employee",
-                    extraction_text="吨 啊 Kow",
-                    attributes={"summary": "员工姓名是 吨 啊 Kow。"},
                 ),
                 lx.data.Extraction(
                     extraction_class="pay_period",
-                    extraction_text="2021年9月1日 - 2021年9月30日",
-                    attributes={"summary": "支付周期为2021年9月1日至30日。"},
                 ),
                 lx.data.Extraction(
                     extraction_class="salary",
-                    extraction_text="基础 支付 2000美元",
-                    attributes={"summary": "基本工资是 2000美元。"},
                 ),
                 lx.data.Extraction(
                     extraction_class="bonus",
-                    extraction_text="阿内 奖金 2000美元",
-                    attributes={"summary": "奖金是 2000美元。"},
                 ),
             ],
         )
     ]
     try:
-        logger.info("Stage 2: Starting structured data extraction from cleaned text...")
         annotated_document = await asyncio.to_thread(
             lx.extract,
-            text_or_documents=cleaned_document_text,
             prompt_description=prompt,
             examples=examples,
-            model_id="gemini-1.5-flash-latest",
         )
-        logger.info("Stage 2: Extraction complete.")
-        language = "unknown"
         extracted_data = {}
         debug_visualization_html = lx.visualize(annotated_document)
         for extr in annotated_document.extractions:
-            if extr.extraction_class == "document_meta":
-                # --- THIS IS THE FIX ---
-                # Add a safety check to ensure attributes is not None before accessing it.
-                if extr.attributes:
-                    language = extr.attributes.get("language_code", "unknown")
-            else:
-                if extr.attributes:  # Also add a check here for safety
-                    extracted_data[extr.extraction_class] = {
                         "text": extr.extraction_text,
                         "summary": extr.attributes.get(
                             "summary", "No summary provided."
                         ),
                     }
-        logger.info("Stage 3: Generating final HTML summary sheet...")
-        summary_sheet_html = await _generate_html_summary(extracted_data, language)
-        logger.info("Stage 3: HTML summary sheet generated.")
         return {
-            "language": language,
             "extracted_data": extracted_data,
             "summary_sheet_html": summary_sheet_html,
             "debug_visualization_html": debug_visualization_html,

 import textwrap
 import asyncio
 import re
+import httpx
 import langextract as lx
 from bs4 import BeautifulSoup
     """
     Takes messy OCR text and uses Gemini to clean it into a coherent document.
     """
+    model = genai.GenerativeModel(model_name="gemini-2.5-flash")
     prompt = textwrap.dedent(
         f"""
         The following text is from a messy OCR process. It contains extra spaces, incorrect line breaks, and jumbled words.
         return messy_text
+async def _translate_text_to_english_with_sealion(text: str) -> str:
     """
+    Translates the given text to English using the Sea-Lion model.
     """
+    url = "https://api.sea-lion.ai/v1/chat/completions"
+    api_key = os.getenv("SEALION_API_KEY")
+    if not api_key:
+        logger.warning("SEALION_API_KEY not found. Skipping translation.")
+        return text
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json",
+    }
+    prompt = f'Translate the following text to English. Return ONLY the translated text, without any additional explanations, formatting, or quotation marks:\n\n"{text}"'
+    payload = {
+        "max_completion_tokens": 4096,
+        "messages": [{"role": "user", "content": prompt}],
+        "model": "aisingapore/Gemma-SEA-LION-v3-9B-IT",
+    }
+    async with httpx.AsyncClient() as client:
+        try:
+            response = await client.post(
+                url, headers=headers, json=payload, timeout=60.0
+            )
+            response.raise_for_status()
+            response_json = response.json()
+            translated_text = response_json["choices"][0]["message"]["content"].strip()
+            return re.sub(r'^"|"$', "", translated_text)
+        except httpx.RequestError as e:
+            logger.error(f"Translation request to Sea-Lion failed: {e}")
+            return text
+        except (KeyError, IndexError) as e:
+            logger.error(f"Could not parse Sea-Lion translation response: {e}")
+            return text
+async def _generate_html_summary(extracted_data: dict) -> str:
+    """
+    Takes the structured data and generates a clean, user-friendly HTML summary sheet in English.
+    """
+    model = genai.GenerativeModel(model_name="gemini-2.5-flash")
     prompt_data = json.dumps(extracted_data, indent=2, ensure_ascii=False)
     prompt = textwrap.dedent(
         f"""
+        You are a web designer creating a one-page summary sheet.
         Your task is to convert the following JSON data into a simple, clean, and easy-to-read HTML document.
+        The entire document MUST be in English.
         **JSON Data:**
         ```json
         **Instructions:**
         1.  Use a single HTML file structure. Include modern, clean CSS in a `<style>` tag.
+        2.  Create a main container and use a card-based layout.
+        3.  Use clear headings (e.g., `<h2>`, `<h3>`) for each section.
         4.  Display the `summary` for each clause prominently.
+        5.  The final output must ONLY be the raw HTML code.
     """
     )
     try:
 async def analyze_contract(html_content: str) -> dict:
     """
+    Analyzes a contract by cleaning, translating, extracting data, and generating a summary.
     """
     messy_document_text = extract_text_from_html(html_content)
     if not messy_document_text.strip():
             "error": "Could not extract any meaningful text from the provided HTML content."
         }
+    logger.info("Stage 1: Pre-cleaning raw text...")
     cleaned_document_text = await _pre_clean_text_with_gemini(messy_document_text)
     logger.info("Stage 1: Pre-cleaning complete.")
+    logger.info("Stage 2: Translating text to English with Sea-Lion...")
+    english_document_text = await _translate_text_to_english_with_sealion(
+        cleaned_document_text
+    )
+    logger.info("Stage 2: Translation complete.")
     prompt = textwrap.dedent(
         """
+        You are an expert in labor laws. From the provided English text, extract the following entities.
         - `employer`: The name of the employer.
         - `employee`: The name of the employee.
         - `pay_period`: The date range for the payment.
         - `deductions`: Any deductions from the pay.
         - `bonus`: Any bonus payments.
+        For each entity, add a `summary` attribute written in simple English.
         """
     )
     examples = [
         lx.data.ExampleData(
+            text="Payslip for the period: September 1, 2021 - September 30, 2021. Employer's Name: ABC PTE LTD. Employee's Name: Tan Ah Kow. Basic Pay: $2000. Annual Bonus: $2000.",
             extractions=[
                 lx.data.Extraction(
                     extraction_class="employer",
+                    extraction_text="ABC PTE LTD",
+                    attributes={"summary": "The employer is ABC PTE LTD."},
                 ),
                 lx.data.Extraction(
                     extraction_class="employee",
+                    extraction_text="Tan Ah Kow",
+                    attributes={"summary": "The employee's name is Tan Ah Kow."},
                 ),
                 lx.data.Extraction(
                     extraction_class="pay_period",
+                    extraction_text="September 1, 2021 - September 30, 2021",
+                    attributes={
+                        "summary": "The pay period is from September 1, 2021 to September 30, 2021."
+                    },
                 ),
                 lx.data.Extraction(
                     extraction_class="salary",
+                    extraction_text="Basic Pay: $2000",
+                    attributes={"summary": "The base salary is $2000."},
                 ),
                 lx.data.Extraction(
                     extraction_class="bonus",
+                    extraction_text="Annual Bonus: $2000",
+                    attributes={"summary": "The annual bonus is $2000."},
                 ),
             ],
         )
     ]
     try:
+        logger.info("Stage 3: Starting structured data extraction from English text...")
         annotated_document = await asyncio.to_thread(
             lx.extract,
+            text_or_documents=english_document_text,
             prompt_description=prompt,
             examples=examples,
+            model_id="gemini-2.5-flash",
         )
+        logger.info("Stage 3: Extraction complete.")
         extracted_data = {}
         debug_visualization_html = lx.visualize(annotated_document)
         for extr in annotated_document.extractions:
+            if extr.attributes:
+                class_key = extr.extraction_class.replace(" ", "_")
+                if class_key not in extracted_data:
+                    extracted_data[class_key] = []
+                extracted_data[class_key].append(
+                    {
                         "text": extr.extraction_text,
                         "summary": extr.attributes.get(
                             "summary", "No summary provided."
                         ),
                     }
+                )
+        logger.info("Stage 4: Generating final HTML summary sheet...")
+        summary_sheet_html = await _generate_html_summary(extracted_data)
+        logger.info("Stage 4: HTML summary sheet generated.")
         return {
+            "language": "en",
             "extracted_data": extracted_data,
             "summary_sheet_html": summary_sheet_html,
             "debug_visualization_html": debug_visualization_html,