Spaces:

BarudakLearning
/

terTATa

Sleeping

App Files Files Community

paddle12 committed on Jul 10, 2025

Commit

2c4447e

verified ·

1 Parent(s): 2212ca0

Upload utils.py

Browse files

Files changed (1) hide show

utils.py +148 -0

utils.py ADDED Viewed

	@@ -0,0 +1,148 @@

+import uuid
+import json
+from typing import List, Dict, Any, Tuple
+from openpyxl import load_workbook
+# === XLSX to JSON Conversion Functions ===
def detect_table_and_paragraphs(worksheet) -> Tuple[List[List[str]], List[Dict[str, Any]]]:
    """
    Split a worksheet into one table and the free-text paragraphs around it.

    Rows are scanned top to bottom with the following heuristic:

    * A row with two or more non-``None`` cells is a table row.
    * A fully blank row after at least one table row ends the table.
    * A row with exactly one non-``None`` cell is a paragraph if it appears
      before the table has started (e.g. a title), or if its text is longer
      than 50 characters or contains no digit; in the latter case it also
      ends the table. Any other single-cell row (short, with digits) is kept
      as a table row instead of being discarded.
    * Once the table has ended, every non-blank row becomes one paragraph
      made of its cell texts joined by single spaces.

    Args:
        worksheet: Object exposing ``max_column``, ``max_row`` and
            ``iter_rows(min_row=..., max_row=..., values_only=True)``
            (an openpyxl worksheet in this module).

    Returns:
        ``(table_data, paragraphs)`` where ``table_data`` is a list of rows of
        strings (``None`` cells become ``""``) and ``paragraphs`` is a list of
        dicts with a fresh ``uid``, a 1-based ``order`` and the ``text``.
    """
    max_col = worksheet.max_column
    max_row = worksheet.max_row

    table_data: List[List[str]] = []
    paragraph_texts: List[str] = []
    table_ended = False

    # Blank rows are deliberately NOT filtered out up front: a blank row is
    # the signal that separates the table from the text that follows it.
    for row in worksheet.iter_rows(min_row=1, max_row=max_row, values_only=True):
        filled = [cell for cell in row if cell is not None]

        if table_ended:
            # Compare against None (not truthiness) so values such as 0 survive.
            text = " ".join(s for s in map(str, filled) if s.strip()).strip()
            if text:
                paragraph_texts.append(text)
            continue

        if not filled:
            # Leading blank rows are ignored; a blank row after data ends the table.
            if table_data:
                table_ended = True
            continue

        if len(filled) == 1:
            text = str(filled[0]).strip()
            if not text:
                continue
            looks_like_paragraph = len(text) > 50 or not any(ch.isdigit() for ch in text)
            if not table_data:
                # A table cannot end before it has started: treat as a caption.
                paragraph_texts.append(text)
                continue
            if looks_like_paragraph:
                table_ended = True
                paragraph_texts.append(text)
                continue
            # Otherwise fall through and keep the row as part of the table.

        table_data.append(["" if cell is None else str(cell) for cell in row[:max_col]])

    paragraphs = [
        {"uid": str(uuid.uuid4()), "order": order, "text": text}
        for order, text in enumerate(paragraph_texts, start=1)
    ]
    return table_data, paragraphs
def xlsx_to_json(file_path) -> Dict[str, Any]:
    """
    Build a TAT-QA style dictionary from the active sheet of an XLSX file.

    The workbook is opened with ``data_only=True`` and only its active sheet
    is inspected. The result holds the detected table (under a freshly
    generated uid), the detected paragraphs, and an empty ``questions`` list.
    """
    active_sheet = load_workbook(file_path, data_only=True).active

    # Split the sheet content into the table rows and the paragraph records
    rows, paragraph_items = detect_table_and_paragraphs(active_sheet)

    table_entry = {"uid": str(uuid.uuid4()), "table": rows}
    return {
        "table": table_entry,
        "paragraphs": paragraph_items,
        "questions": [],  # Empty for user to fill later
    }
def json_to_jsonl(json_data: Dict[str, Any]) -> str:
    """
    Serialize ``json_data`` as a single JSONL record (one JSON object, one line).

    Non-ASCII characters are written as-is rather than as ``\\uXXXX`` escapes.
    """
    # No indent is requested, so the encoder emits the whole object on one line.
    single_line = json.dumps(json_data, ensure_ascii=False)
    return single_line
def json_to_markdown(json_data: Dict[str, Any]) -> str:
    """
    Render the table and paragraphs of ``json_data`` as markdown for display.

    The first table row is used as the markdown header. Cells are made safe
    for a markdown table: non-string values are converted with ``str``,
    ``None`` becomes an empty cell, line breaks inside a cell are replaced by
    spaces and ``|`` is escaped. Rows shorter than the header are padded with
    empty cells so every row has at least the header's column count.

    Args:
        json_data: Dict with ``json_data["table"]["table"]`` (list of rows)
            and ``json_data["paragraphs"]`` (list of dicts with ``order`` and
            ``text``).

    Returns:
        The markdown document as a single string.
    """

    def _cell(value: Any) -> str:
        # A raw newline or an unescaped pipe would split the cell or the row.
        text = "" if value is None else str(value)
        return " ".join(text.splitlines()).replace("|", "\\|")

    def _row(cells) -> str:
        return "| " + " | ".join(_cell(cell) for cell in cells) + " |\n"

    parts = ["## Table Data\n\n"]

    table = json_data["table"]["table"]
    if table:
        header = table[0]
        width = len(header)
        parts.append(_row(header))
        parts.append(_row(["---"] * width))
        for row in table[1:]:
            # Multiplying a list by a negative count yields [], so long rows are untouched.
            parts.append(_row(list(row) + [""] * (width - len(row))))

    parts.append("\n## Context/Paragraphs\n\n")
    parts.extend(f"{para['order']}. {para['text']}\n\n" for para in json_data["paragraphs"])
    return "".join(parts)
+# === Updated Prompt Creation Function ===
def create_prompt(table_data: Dict[str, Any], question: str) -> str:
    """
    Create prompt in the same format as training data.

    The table is rendered as pipe-delimited rows (no header separator line),
    followed by the paragraph texts, the question and an open ``### Answer``
    section. Cells that are not strings are converted with ``str`` and
    ``None`` becomes an empty cell, so tables carrying numbers or nulls do
    not raise ``TypeError`` when the rows are joined.

    Args:
        table_data: Dict with ``table_data["table"]["table"]`` (list of rows)
            and ``table_data["paragraphs"]`` (list of dicts with ``text``).
        question: The question to append to the prompt.

    Returns:
        The complete prompt string.
    """
    # Convert table to pipe-delimited rows, coercing every cell to text
    rows = table_data["table"]["table"]
    table_md = "\n".join(
        "| " + " | ".join("" if cell is None else str(cell) for cell in row) + " |"
        for row in rows
    )
    # Extract paragraph texts
    text_content = "\n".join(str(p["text"]) for p in table_data["paragraphs"])
    prompt = f"""### Instruction
Given a table and a list of texts in the following, answer the question posed using the following six-step process:
1. Step 1: Predict the type of question being asked. Store this prediction in the variable {{question_type}}.
2. Step 2: Extract the relevant strings or numerical values from the provided table or texts. Store them in {{evidence}}.
3. Step 3: If {{question_type}} is Arithmetic, generate an equation in {{equation}}. Otherwise, put N.A..
4. Step 4: Compute the final answer and store in {{answer}}.
5. Step 5: Predict the answer's scale in {{scale}}. One of: none, percent, thousand, million, billion.
6. Step 6: Based on the {{answer}} and {{question_type}}, generate a short and logical recommendation, business insight, or next action. Store it in {{action}}.
### Table
{table_md}
### Text
{text_content}
### Question
{question}
### Answer"""
    return prompt