Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- Dockerfile +51 -0
- README.md +29 -5
- __init__.py +1 -0
- client.py +62 -0
- environments/__init__.py +1 -0
- environments/shared/__init__.py +11 -0
- environments/shared/domains.py +110 -0
- environments/shared/enrichment_sources.py +363 -0
- environments/shared/enterprise_data.py +242 -0
- environments/shared/personas.py +105 -0
- environments/shared/reward_utils.py +55 -0
- models.py +37 -0
- openenv.yaml +6 -0
- pyproject.toml +28 -0
- server/__init__.py +5 -0
- server/app.py +48 -0
- server/cleaning_environment.py +213 -0
- server/requirements.txt +5 -0
- uv.lock +0 -0
Dockerfile
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Multi-stage build: resolve dependencies with uv in a builder stage, then
# ship only the resulting virtualenv plus the source tree in the runtime image.
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
FROM ${BASE_IMAGE} AS builder

WORKDIR /app

RUN apt-get update && \
    apt-get install -y --no-install-recommends git && \
    rm -rf /var/lib/apt/lists/*

ARG BUILD_MODE=in-repo
ARG ENV_NAME=datasage_cleaning

COPY . /app/env

WORKDIR /app/env

# Install uv only if the base image does not already ship it.
RUN if ! command -v uv >/dev/null 2>&1; then \
        curl -LsSf https://astral.sh/uv/install.sh | sh && \
        mv /root/.local/bin/uv /usr/local/bin/uv && \
        mv /root/.local/bin/uvx /usr/local/bin/uvx; \
    fi

# Layer 1: third-party dependencies only (cacheable across source changes).
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-install-project --no-editable; \
    else \
        uv sync --no-install-project --no-editable; \
    fi

# Layer 2: install the project itself on top of the dependency layer.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-editable; \
    else \
        uv sync --no-editable; \
    fi

FROM ${BASE_IMAGE}

WORKDIR /app

COPY --from=builder /app/env/.venv /app/.venv
COPY --from=builder /app/env /app/env

ENV PATH="/app/.venv/bin:$PATH"
ENV PYTHONPATH="/app/env:$PYTHONPATH"

HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

ENV ENABLE_WEB_INTERFACE=true
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
|
README.md
CHANGED
|
@@ -1,10 +1,34 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: DataSage Cleaning Environment
|
| 3 |
+
emoji: 🧹
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: blue
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
app_port: 8000
|
| 9 |
+
base_path: /web
|
| 10 |
+
tags:
|
| 11 |
+
- openenv
|
| 12 |
---
|
| 13 |
|
| 14 |
+
# DataSage Cleaning Environment
|
| 15 |
+
|
| 16 |
+
An RL environment for training agents to clean enterprise data across four domains (HR, Sales, Project Management, IT Operations).
|
| 17 |
+
|
| 18 |
+
The agent receives a corrupted 50-row data batch and must apply cleaning operations (fill nulls, fix types, remove duplicates, standardize values, trim whitespace, correct typos) to maximise a composite data quality score. Episodes end when DQ > 0.95 or after 15 steps.
|
| 19 |
+
|
| 20 |
+
## Quick Start
|
| 21 |
+
|
| 22 |
+
```python
|
| 23 |
+
from environments.cleaning.models import CleaningAction
|
| 24 |
+
from environments.cleaning.client import CleaningEnv
|
| 25 |
+
|
| 26 |
+
with CleaningEnv(base_url="http://localhost:8000") as env:
|
| 27 |
+
result = env.reset()
|
| 28 |
+
print(f"Domain: {result.observation.domain}, DQ: {result.observation.dq_score}")
|
| 29 |
+
|
| 30 |
+
result = env.step(CleaningAction(
|
| 31 |
+
operation="fill_null", column="Age", value="median"
|
| 32 |
+
))
|
| 33 |
+
print(f"DQ after step: {result.observation.dq_score}")
|
| 34 |
+
```
|
__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""DataSage Cleaning Environment."""
|
client.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""DataSage Cleaning Environment Client."""
|
| 2 |
+
|
| 3 |
+
from typing import Dict
|
| 4 |
+
|
| 5 |
+
from openenv.core import EnvClient
|
| 6 |
+
from openenv.core.client_types import StepResult
|
| 7 |
+
from openenv.core.env_server.types import State
|
| 8 |
+
|
| 9 |
+
from .models import CleaningAction, CleaningObservation
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class CleaningEnv(EnvClient[CleaningAction, CleaningObservation, State]):
    """
    Client for the DataSage Cleaning Environment.

    Example:
        >>> with CleaningEnv(base_url="http://localhost:8000") as client:
        ...     result = client.reset()
        ...     print(result.observation.dq_score)
        ...     result = client.step(CleaningAction(
        ...         operation="fill_null", column="Age", value="median"
        ...     ))
        ...     print(result.observation.dq_score)
    """

    def _step_payload(self, action: CleaningAction) -> Dict:
        """Serialize a CleaningAction into the JSON body sent to the server."""
        return {
            "operation": action.operation,
            "column": action.column,
            "value": action.value,
            "params": action.params,
        }

    def _parse_result(self, payload: Dict) -> StepResult[CleaningObservation]:
        """Parse a server response into StepResult[CleaningObservation].

        Missing fields fall back to neutral defaults so a partial payload
        still yields a well-formed observation.
        """
        obs_data = payload.get("observation", {})
        reward = payload.get("reward")
        done = payload.get("done", False)

        # reward/done are mirrored onto the observation for convenience.
        observation = CleaningObservation(
            domain=obs_data.get("domain", ""),
            data_preview=obs_data.get("data_preview", ""),
            dq_report=obs_data.get("dq_report", ""),
            dq_score=obs_data.get("dq_score", 0.0),
            columns_info=obs_data.get("columns_info", ""),
            step_number=obs_data.get("step_number", 0),
            max_steps=obs_data.get("max_steps", 15),
            done=done,
            reward=reward,
            metadata=obs_data.get("metadata", {}),
        )

        return StepResult(observation=observation, reward=reward, done=done)

    def _parse_state(self, payload: Dict) -> State:
        """Parse a server response into a State object."""
        return State(
            episode_id=payload.get("episode_id"),
            step_count=payload.get("step_count", 0),
        )
|
environments/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# DataSage environments package
|
environments/shared/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared utilities for DataSage multi-domain enterprise environments."""
|
| 2 |
+
|
| 3 |
+
from .domains import DOMAINS, DomainConfig
|
| 4 |
+
from .personas import PERSONAS, Persona, score_persona_alignment
|
| 5 |
+
from .reward_utils import cleaning_reward, enrichment_reward, answering_reward
|
| 6 |
+
|
| 7 |
+
__all__ = [
|
| 8 |
+
"DOMAINS", "DomainConfig",
|
| 9 |
+
"PERSONAS", "Persona", "score_persona_alignment",
|
| 10 |
+
"cleaning_reward", "enrichment_reward", "answering_reward",
|
| 11 |
+
]
|
environments/shared/domains.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Domain registry for the 4 enterprise data domains."""
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class DomainConfig(BaseModel):
|
| 7 |
+
name: str
|
| 8 |
+
display_name: str
|
| 9 |
+
dataset_key: str
|
| 10 |
+
columns: list[str]
|
| 11 |
+
numeric_columns: list[str]
|
| 12 |
+
categorical_columns: list[str]
|
| 13 |
+
possible_enrichments: list[str]
|
| 14 |
+
example_questions: list[str]
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
DOMAINS = {
|
| 18 |
+
"hr": DomainConfig(
|
| 19 |
+
name="hr",
|
| 20 |
+
display_name="HR & People",
|
| 21 |
+
dataset_key="hr",
|
| 22 |
+
columns=[
|
| 23 |
+
"EmployeeID", "Age", "Department", "JobRole", "MonthlyIncome",
|
| 24 |
+
"YearsAtCompany", "Attrition", "JobSatisfaction", "OverTime",
|
| 25 |
+
"DistanceFromHome", "Education", "PerformanceRating",
|
| 26 |
+
],
|
| 27 |
+
numeric_columns=["Age", "MonthlyIncome", "YearsAtCompany", "DistanceFromHome"],
|
| 28 |
+
categorical_columns=["Department", "JobRole", "Attrition", "OverTime"],
|
| 29 |
+
possible_enrichments=[
|
| 30 |
+
"salary_band", "tenure_risk", "satisfaction_index",
|
| 31 |
+
"industry_benchmark", "flight_risk_score",
|
| 32 |
+
],
|
| 33 |
+
example_questions=[
|
| 34 |
+
"Which departments have the highest attrition rates?",
|
| 35 |
+
"What factors correlate most with employee turnover?",
|
| 36 |
+
"How does overtime affect job satisfaction?",
|
| 37 |
+
"What is the salary distribution across job roles?",
|
| 38 |
+
"Which employees are at highest flight risk?",
|
| 39 |
+
],
|
| 40 |
+
),
|
| 41 |
+
"sales": DomainConfig(
|
| 42 |
+
name="sales",
|
| 43 |
+
display_name="Sales & Revenue",
|
| 44 |
+
dataset_key="sales",
|
| 45 |
+
columns=[
|
| 46 |
+
"DealID", "AccountName", "Stage", "Amount", "CloseDate",
|
| 47 |
+
"Rep", "Product", "Region", "LeadSource", "DaysInStage",
|
| 48 |
+
"Probability", "ForecastCategory",
|
| 49 |
+
],
|
| 50 |
+
numeric_columns=["Amount", "DaysInStage", "Probability"],
|
| 51 |
+
categorical_columns=["Stage", "Region", "Product", "ForecastCategory"],
|
| 52 |
+
possible_enrichments=[
|
| 53 |
+
"deal_size_category", "velocity_score", "win_probability_model",
|
| 54 |
+
"industry_code", "competitive_risk",
|
| 55 |
+
],
|
| 56 |
+
example_questions=[
|
| 57 |
+
"What's our pipeline health for this quarter?",
|
| 58 |
+
"Which deals are at risk of slipping?",
|
| 59 |
+
"What's the average deal velocity by region?",
|
| 60 |
+
"Which reps are below quota?",
|
| 61 |
+
"What's the conversion rate by lead source?",
|
| 62 |
+
],
|
| 63 |
+
),
|
| 64 |
+
"pm": DomainConfig(
|
| 65 |
+
name="pm",
|
| 66 |
+
display_name="Project Management",
|
| 67 |
+
dataset_key="pm",
|
| 68 |
+
columns=[
|
| 69 |
+
"TaskID", "ProjectName", "Assignee", "Status", "Priority",
|
| 70 |
+
"DueDate", "EstimatedHours", "ActualHours", "Dependencies",
|
| 71 |
+
"Milestone", "RiskFlag", "CompletionPct",
|
| 72 |
+
],
|
| 73 |
+
numeric_columns=["EstimatedHours", "ActualHours", "CompletionPct"],
|
| 74 |
+
categorical_columns=["Status", "Priority", "RiskFlag"],
|
| 75 |
+
possible_enrichments=[
|
| 76 |
+
"schedule_risk_score", "resource_utilization",
|
| 77 |
+
"dependency_chain_depth", "burndown_rate", "delay_probability",
|
| 78 |
+
],
|
| 79 |
+
example_questions=[
|
| 80 |
+
"Which projects are at risk of missing deadlines?",
|
| 81 |
+
"How is resource utilization across teams?",
|
| 82 |
+
"What's the burndown rate for the current sprint?",
|
| 83 |
+
"Which tasks are blocking the most downstream work?",
|
| 84 |
+
"What's our on-time delivery rate?",
|
| 85 |
+
],
|
| 86 |
+
),
|
| 87 |
+
"it_ops": DomainConfig(
|
| 88 |
+
name="it_ops",
|
| 89 |
+
display_name="IT Operations",
|
| 90 |
+
dataset_key="it_ops",
|
| 91 |
+
columns=[
|
| 92 |
+
"TicketID", "Category", "Priority", "Status", "Assignee",
|
| 93 |
+
"CreatedDate", "ResolvedDate", "SLATarget", "EscalationLevel",
|
| 94 |
+
"AffectedSystem", "ResolutionType", "CustomerImpact",
|
| 95 |
+
],
|
| 96 |
+
numeric_columns=["SLATarget", "EscalationLevel"],
|
| 97 |
+
categorical_columns=["Category", "Priority", "Status", "ResolutionType"],
|
| 98 |
+
possible_enrichments=[
|
| 99 |
+
"sla_compliance_flag", "mttr_band", "escalation_path",
|
| 100 |
+
"incident_severity_score", "recurring_pattern_flag",
|
| 101 |
+
],
|
| 102 |
+
example_questions=[
|
| 103 |
+
"What's our SLA compliance rate this month?",
|
| 104 |
+
"Which systems have the most incidents?",
|
| 105 |
+
"What's the mean time to resolution trend?",
|
| 106 |
+
"How many tickets are breaching SLA?",
|
| 107 |
+
"What are the most common root causes?",
|
| 108 |
+
],
|
| 109 |
+
),
|
| 110 |
+
}
|
environments/shared/enrichment_sources.py
ADDED
|
@@ -0,0 +1,363 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Static enrichment lookup tables per domain (no API calls)."""
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
# Enrichment registry: domain -> source -> lookup function or static data
|
| 6 |
+
ENRICHMENT_REGISTRY: dict[str, dict[str, dict]] = {
|
| 7 |
+
"hr": {
|
| 8 |
+
"salary_band": {
|
| 9 |
+
"description": "BLS salary band classification based on monthly income",
|
| 10 |
+
"type": "derived",
|
| 11 |
+
"logic": "classify_salary_band",
|
| 12 |
+
},
|
| 13 |
+
"tenure_risk": {
|
| 14 |
+
"description": "Tenure-based flight risk score",
|
| 15 |
+
"type": "derived",
|
| 16 |
+
"logic": "compute_tenure_risk",
|
| 17 |
+
},
|
| 18 |
+
"satisfaction_index": {
|
| 19 |
+
"description": "Composite satisfaction index from multiple factors",
|
| 20 |
+
"type": "derived",
|
| 21 |
+
"logic": "compute_satisfaction_index",
|
| 22 |
+
},
|
| 23 |
+
"industry_benchmark": {
|
| 24 |
+
"description": "Industry benchmark salary percentile",
|
| 25 |
+
"type": "lookup",
|
| 26 |
+
"data": {
|
| 27 |
+
"Sales Executive": 65000, "Research Scientist": 72000,
|
| 28 |
+
"Manager": 85000, "Lab Technician": 45000,
|
| 29 |
+
"Manufacturing Director": 95000, "Healthcare Representative": 55000,
|
| 30 |
+
"Human Resources": 60000,
|
| 31 |
+
},
|
| 32 |
+
},
|
| 33 |
+
"flight_risk_score": {
|
| 34 |
+
"description": "Combined flight risk from satisfaction, tenure, overtime",
|
| 35 |
+
"type": "derived",
|
| 36 |
+
"logic": "compute_flight_risk",
|
| 37 |
+
},
|
| 38 |
+
},
|
| 39 |
+
"sales": {
|
| 40 |
+
"deal_size_category": {
|
| 41 |
+
"description": "Categorize deal by amount: Small/Medium/Large/Enterprise",
|
| 42 |
+
"type": "derived",
|
| 43 |
+
"logic": "classify_deal_size",
|
| 44 |
+
},
|
| 45 |
+
"velocity_score": {
|
| 46 |
+
"description": "Deal velocity based on days in stage vs benchmark",
|
| 47 |
+
"type": "derived",
|
| 48 |
+
"logic": "compute_velocity_score",
|
| 49 |
+
},
|
| 50 |
+
"win_probability_model": {
|
| 51 |
+
"description": "Heuristic win probability based on stage + days",
|
| 52 |
+
"type": "derived",
|
| 53 |
+
"logic": "compute_win_probability",
|
| 54 |
+
},
|
| 55 |
+
"industry_code": {
|
| 56 |
+
"description": "Industry classification code from account name patterns",
|
| 57 |
+
"type": "lookup",
|
| 58 |
+
"data": {
|
| 59 |
+
"Tech": "SIC-7372", "Healthcare": "SIC-8011",
|
| 60 |
+
"Finance": "SIC-6020", "Retail": "SIC-5311",
|
| 61 |
+
"Manufacturing": "SIC-3559", "default": "SIC-9999",
|
| 62 |
+
},
|
| 63 |
+
},
|
| 64 |
+
"competitive_risk": {
|
| 65 |
+
"description": "Competitive risk score based on deal stage and velocity",
|
| 66 |
+
"type": "derived",
|
| 67 |
+
"logic": "compute_competitive_risk",
|
| 68 |
+
},
|
| 69 |
+
},
|
| 70 |
+
"pm": {
|
| 71 |
+
"schedule_risk_score": {
|
| 72 |
+
"description": "Risk of schedule slippage based on progress vs due date",
|
| 73 |
+
"type": "derived",
|
| 74 |
+
"logic": "compute_schedule_risk",
|
| 75 |
+
},
|
| 76 |
+
"resource_utilization": {
|
| 77 |
+
"description": "Resource utilization ratio: actual/estimated hours",
|
| 78 |
+
"type": "derived",
|
| 79 |
+
"logic": "compute_resource_utilization",
|
| 80 |
+
},
|
| 81 |
+
"dependency_chain_depth": {
|
| 82 |
+
"description": "Depth of dependency chain for task",
|
| 83 |
+
"type": "derived",
|
| 84 |
+
"logic": "compute_dependency_depth",
|
| 85 |
+
},
|
| 86 |
+
"burndown_rate": {
|
| 87 |
+
"description": "Task completion rate relative to plan",
|
| 88 |
+
"type": "derived",
|
| 89 |
+
"logic": "compute_burndown_rate",
|
| 90 |
+
},
|
| 91 |
+
"delay_probability": {
|
| 92 |
+
"description": "Probability of delay based on current trajectory",
|
| 93 |
+
"type": "derived",
|
| 94 |
+
"logic": "compute_delay_probability",
|
| 95 |
+
},
|
| 96 |
+
},
|
| 97 |
+
"it_ops": {
|
| 98 |
+
"sla_compliance_flag": {
|
| 99 |
+
"description": "Whether ticket meets SLA target",
|
| 100 |
+
"type": "derived",
|
| 101 |
+
"logic": "compute_sla_compliance",
|
| 102 |
+
},
|
| 103 |
+
"mttr_band": {
|
| 104 |
+
"description": "Mean time to resolution band: Fast/Normal/Slow/Critical",
|
| 105 |
+
"type": "derived",
|
| 106 |
+
"logic": "classify_mttr",
|
| 107 |
+
},
|
| 108 |
+
"escalation_path": {
|
| 109 |
+
"description": "Recommended escalation path based on category and priority",
|
| 110 |
+
"type": "lookup",
|
| 111 |
+
"data": {
|
| 112 |
+
"P1-Critical": "L3 -> Manager -> VP",
|
| 113 |
+
"P2-High": "L2 -> L3 -> Manager",
|
| 114 |
+
"P3-Medium": "L1 -> L2",
|
| 115 |
+
"P4-Low": "L1",
|
| 116 |
+
},
|
| 117 |
+
},
|
| 118 |
+
"incident_severity_score": {
|
| 119 |
+
"description": "Computed severity score from priority and customer impact",
|
| 120 |
+
"type": "derived",
|
| 121 |
+
"logic": "compute_severity_score",
|
| 122 |
+
},
|
| 123 |
+
"recurring_pattern_flag": {
|
| 124 |
+
"description": "Flag indicating likely recurring issue",
|
| 125 |
+
"type": "derived",
|
| 126 |
+
"logic": "detect_recurring_pattern",
|
| 127 |
+
},
|
| 128 |
+
},
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def lookup(domain: str, source: str, row: dict) -> object:
    """Resolve an enrichment value for a single data row.

    For "lookup" sources, each cell of the row is tried (stringified)
    against the static table and the first hit wins, falling back to the
    table's "default" entry if present.  For "derived" sources, the
    registered compute function is invoked on the row.  Unknown
    domain/source combinations yield None.
    """
    source_config = ENRICHMENT_REGISTRY.get(domain, {}).get(source)
    if not source_config:
        return None

    if source_config["type"] == "lookup":
        table = source_config["data"]
        # Column order is arbitrary: the first cell whose string form
        # appears in the table determines the result.
        for column in row:
            cell = str(row.get(column, ""))
            if cell in table:
                return table[cell]
        return table.get("default")

    # "derived" source: dispatch on the registered logic name.
    compute_fn = _COMPUTE_FUNCTIONS.get(source_config["logic"])
    return compute_fn(row) if compute_fn else None
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
# --- Computation functions ---
|
| 158 |
+
|
| 159 |
+
def _classify_salary_band(row: dict) -> str:
|
| 160 |
+
try:
|
| 161 |
+
income = float(row.get("MonthlyIncome", 0))
|
| 162 |
+
except (ValueError, TypeError):
|
| 163 |
+
return "Unknown"
|
| 164 |
+
if income < 3000:
|
| 165 |
+
return "Entry"
|
| 166 |
+
elif income < 6000:
|
| 167 |
+
return "Mid"
|
| 168 |
+
elif income < 10000:
|
| 169 |
+
return "Senior"
|
| 170 |
+
return "Executive"
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def _compute_tenure_risk(row: dict) -> float:
|
| 174 |
+
try:
|
| 175 |
+
years = float(row.get("YearsAtCompany", 0))
|
| 176 |
+
except (ValueError, TypeError):
|
| 177 |
+
return 0.5
|
| 178 |
+
# Short tenure = higher risk, very long = moderate risk
|
| 179 |
+
if years < 2:
|
| 180 |
+
return 0.8
|
| 181 |
+
elif years < 5:
|
| 182 |
+
return 0.4
|
| 183 |
+
elif years < 10:
|
| 184 |
+
return 0.2
|
| 185 |
+
return 0.3
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def _compute_satisfaction_index(row: dict) -> float:
|
| 189 |
+
try:
|
| 190 |
+
satisfaction = float(row.get("JobSatisfaction", 3))
|
| 191 |
+
except (ValueError, TypeError):
|
| 192 |
+
satisfaction = 3
|
| 193 |
+
return round(satisfaction / 4.0, 2)
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
def _compute_flight_risk(row: dict) -> float:
|
| 197 |
+
tenure_risk = _compute_tenure_risk(row)
|
| 198 |
+
sat_index = _compute_satisfaction_index(row)
|
| 199 |
+
overtime = 0.3 if str(row.get("OverTime", "No")).lower() == "yes" else 0.0
|
| 200 |
+
return round(0.4 * tenure_risk + 0.4 * (1 - sat_index) + 0.2 * overtime, 2)
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def _classify_deal_size(row: dict) -> str:
|
| 204 |
+
try:
|
| 205 |
+
amount = float(row.get("Amount", 0))
|
| 206 |
+
except (ValueError, TypeError):
|
| 207 |
+
return "Unknown"
|
| 208 |
+
if amount < 5000:
|
| 209 |
+
return "Small"
|
| 210 |
+
elif amount < 25000:
|
| 211 |
+
return "Medium"
|
| 212 |
+
elif amount < 100000:
|
| 213 |
+
return "Large"
|
| 214 |
+
return "Enterprise"
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def _compute_velocity_score(row: dict) -> float:
|
| 218 |
+
try:
|
| 219 |
+
days = float(row.get("DaysInStage", 0))
|
| 220 |
+
except (ValueError, TypeError):
|
| 221 |
+
return 0.5
|
| 222 |
+
# Benchmark: 30 days per stage
|
| 223 |
+
if days < 15:
|
| 224 |
+
return 1.0
|
| 225 |
+
elif days < 30:
|
| 226 |
+
return 0.7
|
| 227 |
+
elif days < 60:
|
| 228 |
+
return 0.4
|
| 229 |
+
return 0.1
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def _compute_win_probability(row: dict) -> float:
|
| 233 |
+
stage_probs = {
|
| 234 |
+
"Prospecting": 0.10, "Qualification": 0.25, "Proposal": 0.50,
|
| 235 |
+
"Negotiation": 0.75, "Won": 1.0, "Lost": 0.0,
|
| 236 |
+
}
|
| 237 |
+
stage = str(row.get("Stage", ""))
|
| 238 |
+
base_prob = stage_probs.get(stage, 0.3)
|
| 239 |
+
velocity = _compute_velocity_score(row)
|
| 240 |
+
return round(0.7 * base_prob + 0.3 * velocity, 2)
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def _compute_competitive_risk(row: dict) -> float:
|
| 244 |
+
velocity = _compute_velocity_score(row)
|
| 245 |
+
return round(1.0 - velocity, 2)
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def _compute_schedule_risk(row: dict) -> float:
|
| 249 |
+
try:
|
| 250 |
+
pct = float(row.get("CompletionPct", 0))
|
| 251 |
+
except (ValueError, TypeError):
|
| 252 |
+
pct = 0
|
| 253 |
+
# Simple: lower completion = higher risk
|
| 254 |
+
return round(1.0 - (pct / 100.0), 2)
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
def _compute_resource_utilization(row: dict) -> float:
|
| 258 |
+
try:
|
| 259 |
+
estimated = float(row.get("EstimatedHours", 1))
|
| 260 |
+
actual = float(row.get("ActualHours", 0))
|
| 261 |
+
except (ValueError, TypeError):
|
| 262 |
+
return 0.0
|
| 263 |
+
if estimated == 0:
|
| 264 |
+
return 0.0
|
| 265 |
+
return round(actual / estimated, 2)
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
def _compute_dependency_depth(row: dict) -> int:
|
| 269 |
+
deps = row.get("Dependencies", "")
|
| 270 |
+
if not deps or str(deps) in ("nan", "None", ""):
|
| 271 |
+
return 0
|
| 272 |
+
return len(str(deps).split(","))
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
def _compute_burndown_rate(row: dict) -> float:
|
| 276 |
+
try:
|
| 277 |
+
pct = float(row.get("CompletionPct", 0))
|
| 278 |
+
estimated = float(row.get("EstimatedHours", 1))
|
| 279 |
+
actual = float(row.get("ActualHours", 0))
|
| 280 |
+
except (ValueError, TypeError):
|
| 281 |
+
return 0.5
|
| 282 |
+
if actual == 0:
|
| 283 |
+
return 0.0
|
| 284 |
+
expected_rate = pct / 100.0
|
| 285 |
+
time_rate = actual / max(estimated, 1)
|
| 286 |
+
return round(expected_rate / max(time_rate, 0.01), 2)
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
def _compute_delay_probability(row: dict) -> float:
|
| 290 |
+
schedule_risk = _compute_schedule_risk(row)
|
| 291 |
+
burndown = _compute_burndown_rate(row)
|
| 292 |
+
return round(schedule_risk * (1.0 / max(burndown, 0.1)), 2)
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
def _compute_sla_compliance(row: dict) -> str:
|
| 296 |
+
try:
|
| 297 |
+
sla = float(row.get("SLATarget", 24))
|
| 298 |
+
escalation = float(row.get("EscalationLevel", 0))
|
| 299 |
+
except (ValueError, TypeError):
|
| 300 |
+
return "Unknown"
|
| 301 |
+
if escalation > 2:
|
| 302 |
+
return "Breached"
|
| 303 |
+
return "Compliant"
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
def _classify_mttr(row: dict) -> str:
|
| 307 |
+
try:
|
| 308 |
+
escalation = float(row.get("EscalationLevel", 0))
|
| 309 |
+
except (ValueError, TypeError):
|
| 310 |
+
return "Normal"
|
| 311 |
+
if escalation == 0:
|
| 312 |
+
return "Fast"
|
| 313 |
+
elif escalation <= 1:
|
| 314 |
+
return "Normal"
|
| 315 |
+
elif escalation <= 3:
|
| 316 |
+
return "Slow"
|
| 317 |
+
return "Critical"
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
def _compute_severity_score(row: dict) -> float:
|
| 321 |
+
priority_scores = {"P1-Critical": 1.0, "P2-High": 0.7, "P3-Medium": 0.4, "P4-Low": 0.1}
|
| 322 |
+
priority = str(row.get("Priority", "P3-Medium"))
|
| 323 |
+
return priority_scores.get(priority, 0.4)
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
def _detect_recurring_pattern(row: dict) -> bool:
|
| 327 |
+
category = str(row.get("Category", ""))
|
| 328 |
+
# Simple heuristic: certain categories tend to recur
|
| 329 |
+
recurring_cats = {"Network", "Email", "Access"}
|
| 330 |
+
return category in recurring_cats
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
# Dispatch table mapping the "logic" names used in ENRICHMENT_REGISTRY to
# their implementations.  Every key equals the function's name without its
# leading underscore (e.g. "_classify_mttr" -> "classify_mttr").
_COMPUTE_FUNCTIONS = {
    fn.__name__[1:]: fn
    for fn in (
        _classify_salary_band,
        _compute_tenure_risk,
        _compute_satisfaction_index,
        _compute_flight_risk,
        _classify_deal_size,
        _compute_velocity_score,
        _compute_win_probability,
        _compute_competitive_risk,
        _compute_schedule_risk,
        _compute_resource_utilization,
        _compute_dependency_depth,
        _compute_burndown_rate,
        _compute_delay_probability,
        _compute_sla_compliance,
        _classify_mttr,
        _compute_severity_score,
        _detect_recurring_pattern,
    )
}
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
def get_available_enrichments(domain: str) -> list[str]:
    """Return the enrichment source names registered for *domain* (empty list if unknown)."""
    return list(ENRICHMENT_REGISTRY.get(domain, {}))


def get_enrichment_description(domain: str, source: str) -> str:
    """Return the human-readable description of an enrichment source."""
    entry = ENRICHMENT_REGISTRY.get(domain, {}).get(source, {})
    return entry.get("description", "Unknown enrichment source")
|
environments/shared/enterprise_data.py
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Multi-domain dataset loading, corruption injection, and DQ scoring."""
|
| 2 |
+
|
| 3 |
+
import random
|
| 4 |
+
import string
|
| 5 |
+
from typing import Optional
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import pandas as pd
|
| 9 |
+
|
| 10 |
+
from .domains import DOMAINS, DomainConfig
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def load_domain_data(domain: str, sample_size: Optional[int] = None) -> pd.DataFrame:
    """Load a domain's table from the HF dataset, with a synthetic fallback.

    Any failure while fetching (missing `datasets` package, network error,
    unknown config) silently switches to the deterministic synthetic
    generator.  When *sample_size* is given and the frame is larger, a
    fixed-seed sample of that many rows is returned instead.
    """
    try:
        from datasets import load_dataset

        df = load_dataset(
            "ricalanis/datasage-enterprise-raw", domain, split="train"
        ).to_pandas()
    except Exception:
        # Offline / dependency-free fallback.
        df = _generate_synthetic(domain)

    if sample_size and len(df) > sample_size:
        df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)
    return df
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _generate_synthetic(domain: str, n: int = 200) -> pd.DataFrame:
    """Build a deterministic synthetic frame for *domain* (HF-dataset fallback).

    Each configured column is filled according to its role: numeric columns
    get N(50, 20) draws, categorical columns get domain vocabulary values,
    and name-based heuristics cover IDs, dates, and person names; anything
    else gets an opaque placeholder string.
    """
    config = DOMAINS[domain]
    rng = np.random.default_rng(42)  # fixed seed keeps fallbacks reproducible
    data = {}

    for name in config.columns:
        if name in config.numeric_columns:
            data[name] = rng.normal(50, 20, n).round(2)
        elif name in config.categorical_columns:
            data[name] = rng.choice(_get_categories(domain, name), n).tolist()
        elif "ID" in name:
            # e.g. "EMP-0000", "EMP-0001", ... from the column prefix.
            data[name] = [f"{name[:3].upper()}-{i:04d}" for i in range(n)]
        elif "Date" in name:
            start = pd.Timestamp("2024-01-01")
            offsets = rng.integers(0, 365, n)
            data[name] = [
                (start + pd.Timedelta(days=int(off))).strftime("%Y-%m-%d")
                for off in offsets
            ]
        elif "Name" in name or "Assignee" in name or "Rep" in name:
            people = ["Alice", "Bob", "Carol", "Dan", "Eve", "Frank", "Grace", "Hank"]
            data[name] = rng.choice(people, n).tolist()
        else:
            data[name] = [f"{name}_val_{i}" for i in range(n)]

    return pd.DataFrame(data)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def _get_categories(domain: str, col: str) -> list[str]:
|
| 55 |
+
"""Return realistic category values per domain and column."""
|
| 56 |
+
cat_map = {
|
| 57 |
+
"hr": {
|
| 58 |
+
"Department": ["Sales", "Research & Development", "Human Resources"],
|
| 59 |
+
"JobRole": ["Sales Executive", "Research Scientist", "Manager", "Lab Technician",
|
| 60 |
+
"Manufacturing Director", "Healthcare Representative"],
|
| 61 |
+
"Attrition": ["Yes", "No"],
|
| 62 |
+
"OverTime": ["Yes", "No"],
|
| 63 |
+
},
|
| 64 |
+
"sales": {
|
| 65 |
+
"Stage": ["Prospecting", "Qualification", "Proposal", "Negotiation", "Won", "Lost"],
|
| 66 |
+
"Region": ["East", "West", "Central", "North", "South"],
|
| 67 |
+
"Product": ["GTX Pro", "GTX Basic", "GTX Plus", "MG Special", "MG Advanced"],
|
| 68 |
+
"ForecastCategory": ["Pipeline", "Best Case", "Commit", "Closed"],
|
| 69 |
+
},
|
| 70 |
+
"pm": {
|
| 71 |
+
"Status": ["Not Started", "In Progress", "Completed", "On Hold", "Cancelled"],
|
| 72 |
+
"Priority": ["Critical", "High", "Medium", "Low"],
|
| 73 |
+
"RiskFlag": ["High", "Medium", "Low", "None"],
|
| 74 |
+
},
|
| 75 |
+
"it_ops": {
|
| 76 |
+
"Category": ["Hardware", "Software", "Network", "Access", "Email"],
|
| 77 |
+
"Priority": ["P1-Critical", "P2-High", "P3-Medium", "P4-Low"],
|
| 78 |
+
"Status": ["Open", "In Progress", "Resolved", "Closed", "Pending"],
|
| 79 |
+
"ResolutionType": ["Fix Applied", "Workaround", "No Fix", "Duplicate", "User Error"],
|
| 80 |
+
},
|
| 81 |
+
}
|
| 82 |
+
return cat_map.get(domain, {}).get(col, ["A", "B", "C"])
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def inject_corruption(df: pd.DataFrame, domain_config: DomainConfig,
                      rate: float = 0.15) -> pd.DataFrame:
    """Inject realistic data quality issues into a DataFrame.

    Applies five corruption passes (nulls, type mismatches, typos,
    duplicates, whitespace) with a fixed rng seed, so the same input always
    produces the same corrupted output. *rate* is the base corruption rate;
    each pass scales it down by its own factor.

    Args:
        df: Clean source frame; not modified (a copy is corrupted).
        domain_config: Supplies numeric/categorical column lists.
        rate: Base fraction of rows to corrupt per pass.

    Returns:
        A new DataFrame with injected issues; duplicates make it slightly
        longer than the input.
    """
    corrupted = df.copy()
    n_rows = len(corrupted)
    rng = np.random.default_rng(42)  # fixed seed -> deterministic corruption

    # 1. Inject nulls into numeric columns
    for col in domain_config.numeric_columns:
        if col in corrupted.columns:
            # Independent Bernoulli(rate) mask per row.
            null_mask = rng.random(n_rows) < rate
            corrupted.loc[null_mask, col] = np.nan

    # 2. Inject type mismatches (strings in numeric columns)
    for col in domain_config.numeric_columns:
        if col in corrupted.columns:
            n_bad = max(1, int(n_rows * rate * 0.3))
            bad_idx = rng.choice(n_rows, n_bad, replace=False)
            # Cast to object so string sentinels can live next to numbers.
            corrupted[col] = corrupted[col].astype(object)
            for idx in bad_idx:
                corrupted.iloc[idx, corrupted.columns.get_loc(col)] = rng.choice(
                    ["N/A", "unknown", "#REF!", "TBD", "-"]
                )

    # 3. Inject typos in categorical columns
    for col in domain_config.categorical_columns:
        if col in corrupted.columns:
            n_typos = max(1, int(n_rows * rate * 0.2))
            typo_idx = rng.choice(n_rows, n_typos, replace=False)
            for idx in typo_idx:
                val = str(corrupted.iloc[idx, corrupted.columns.get_loc(col)])
                corrupted.iloc[idx, corrupted.columns.get_loc(col)] = _add_typo(val, rng)

    # 4. Inject duplicates (appended at the end of the frame)
    n_dupes = max(1, int(n_rows * rate * 0.1))
    dupe_idx = rng.choice(n_rows, n_dupes, replace=False)
    dupes = corrupted.iloc[dupe_idx].copy()
    corrupted = pd.concat([corrupted, dupes], ignore_index=True)

    # 5. Inject whitespace issues (first two categorical columns only)
    for col in domain_config.categorical_columns[:2]:
        if col in corrupted.columns:
            n_ws = max(1, int(n_rows * rate * 0.2))
            # Note: sampled over len(corrupted), which now includes the
            # appended duplicate rows from pass 4.
            ws_idx = rng.choice(len(corrupted), n_ws, replace=False)
            for idx in ws_idx:
                val = str(corrupted.iloc[idx, corrupted.columns.get_loc(col)])
                corrupted.iloc[idx, corrupted.columns.get_loc(col)] = f" {val} "

    return corrupted
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def _add_typo(text: str, rng: np.random.Generator) -> str:
|
| 137 |
+
"""Add a realistic typo to a string."""
|
| 138 |
+
if len(text) < 2:
|
| 139 |
+
return text
|
| 140 |
+
typo_type = rng.choice(["swap", "delete", "insert", "case"])
|
| 141 |
+
idx = rng.integers(0, len(text))
|
| 142 |
+
if typo_type == "swap" and idx < len(text) - 1:
|
| 143 |
+
return text[:idx] + text[idx + 1] + text[idx] + text[idx + 2:]
|
| 144 |
+
elif typo_type == "delete":
|
| 145 |
+
return text[:idx] + text[idx + 1:]
|
| 146 |
+
elif typo_type == "insert":
|
| 147 |
+
char = rng.choice(list(string.ascii_lowercase))
|
| 148 |
+
return text[:idx] + char + text[idx:]
|
| 149 |
+
else:
|
| 150 |
+
return text[:idx] + text[idx].swapcase() + text[idx + 1:]
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def compute_dq_score(df: pd.DataFrame, domain_config: "DomainConfig") -> dict:
    """Compute data quality metrics: completeness, consistency, uniqueness, overall.

    Fix vs. the previous version: an empty frame (or a frame sharing no
    columns with the domain config) no longer raises ZeroDivisionError for
    completeness or yields NaN consistency from ``.mean()`` on an empty
    column -- each metric now degrades gracefully to 1.0.

    Args:
        df: Frame to score (any subset of the domain's columns).
        domain_config: Supplies the expected column list and the numeric
            column list (duck-typed: only ``.columns`` / ``.numeric_columns``
            are read).

    Returns:
        Dict with keys "completeness", "consistency", "uniqueness" and
        "overall" (weighted 0.40 / 0.35 / 0.25), each rounded to 4 places.
    """

    def _numeric_ok(val) -> bool:
        # Nulls count as valid for consistency; anything float()-able does too.
        if pd.isna(val):
            return True
        try:
            float(val)
            return True
        except (ValueError, TypeError):
            return False

    available_cols = [c for c in domain_config.columns if c in df.columns]
    n_rows = len(df)

    # Completeness: fraction of non-null cells over the expected columns.
    if available_cols and n_rows > 0:
        total_cells = n_rows * len(available_cols)
        null_ratio = df[available_cols].isnull().sum().sum() / total_cells
        completeness = 1.0 - null_ratio
    else:
        completeness = 1.0

    # Consistency: fraction of values in numeric columns that parse as numbers.
    consistency_scores = []
    if n_rows > 0:
        for col in domain_config.numeric_columns:
            if col in df.columns:
                consistency_scores.append(df[col].apply(_numeric_ok).mean())
    consistency = float(np.mean(consistency_scores)) if consistency_scores else 1.0

    # Uniqueness: 1 - duplicate ratio, keyed on the first five expected columns
    # (mirrors the duplicate-injection and removal logic elsewhere).
    if n_rows > 0 and available_cols:
        n_dupes = df.duplicated(subset=available_cols[:5], keep="first").sum()
        uniqueness = 1.0 - (n_dupes / n_rows)
    else:
        uniqueness = 1.0

    overall = 0.40 * completeness + 0.35 * consistency + 0.25 * uniqueness

    return {
        "completeness": round(completeness, 4),
        "consistency": round(consistency, 4),
        "uniqueness": round(uniqueness, 4),
        "overall": round(overall, 4),
    }
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
def _is_numeric(val) -> bool:
|
| 190 |
+
"""Check if a value is numeric (or null, which is valid)."""
|
| 191 |
+
if pd.isna(val):
|
| 192 |
+
return True
|
| 193 |
+
try:
|
| 194 |
+
float(val)
|
| 195 |
+
return True
|
| 196 |
+
except (ValueError, TypeError):
|
| 197 |
+
return False
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def compute_dq_score_with_lfs(df: pd.DataFrame, domain: str,
                              lfs: list) -> float:
    """Aggregate Snorkel-style labeling-function votes into a 0-1 DQ score.

    Each LF receives a row and returns GOOD (1), BAD (0) or ABSTAIN (-1);
    an LF that raises is treated as abstaining. A row's score is the share
    of GOOD votes among non-abstaining LFs (0.5 when everything abstained).
    With no LFs or no rows, falls back to the heuristic scorer for a known
    domain, else 0.5.
    """
    ABSTAIN, GOOD = -1, 1

    if not lfs or len(df) == 0:
        config = DOMAINS.get(domain)
        return compute_dq_score(df, config)["overall"] if config else 0.5

    def _score_row(row) -> float:
        votes = []
        for lf in lfs:
            try:
                vote = lf(row)
            except Exception:
                continue  # a crashing LF counts as an abstention
            if vote != ABSTAIN:
                votes.append(vote)
        if not votes:
            return 0.5
        return sum(v == GOOD for v in votes) / len(votes)

    return float(np.mean([_score_row(row) for _, row in df.iterrows()]))
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def format_preview(df: pd.DataFrame, n: int = 5) -> str:
    """Render the first *n* rows as a plain-text table (no index column)."""
    head = df.head(n)
    return head.to_string(index=False, max_colwidth=30)
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
def format_columns_info(df: pd.DataFrame, domain_config: DomainConfig) -> str:
    """One line per column: name, dtype, null count, expected/extra tag."""
    expected_cols = set(domain_config.columns)
    rows = []
    for name in df.columns:
        tag = "expected" if name in expected_cols else "extra"
        dtype_name = str(df[name].dtype)
        rows.append(f"{name}: {dtype_name}, nulls={df[name].isnull().sum()} ({tag})")
    return "\n".join(rows)
|
environments/shared/personas.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Generalized personas for domain-independent question answering."""
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
from pydantic import BaseModel
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class Persona(BaseModel):
    """A stakeholder archetype used to target and score answer style."""

    name: str                 # human-readable label, e.g. "Executive"
    role: str                 # stable lookup key (see PERSONA_MAP)
    focus_areas: list[str]    # topics this persona cares about
    language_style: str       # style tag consumed by _check_formality
    keywords: list[str]       # terms whose presence raises alignment
    anti_keywords: list[str]  # terms whose presence is penalised
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# The three stakeholder archetypes answers can be targeted at. Keywords and
# anti-keywords feed score_persona_alignment; language_style feeds
# _check_formality.
PERSONAS = [
    Persona(
        name="Executive",
        role="executive",
        focus_areas=["costs", "ROI", "strategic risk", "portfolio trends", "year-over-year"],
        language_style="strategic-financial",
        keywords=["revenue", "cost", "ROI", "risk", "trend", "quarter",
                  "year-over-year", "impact", "budget", "margin", "growth"],
        anti_keywords=["I think", "maybe", "um", "idk"],
    ),
    Persona(
        name="Manager",
        role="manager",
        focus_areas=["team performance", "operational health", "process bottlenecks", "capacity"],
        language_style="operational-actionable",
        keywords=["team", "performance", "bottleneck", "capacity", "SLA",
                  "process", "action", "priority", "escalation", "delivery"],
        anti_keywords=["shareholder", "valuation", "IPO"],
    ),
    Persona(
        name="Individual Contributor",
        role="ic",
        focus_areas=["personal tasks", "deadlines", "what to do next", "simple explanations"],
        language_style="plain-personal",
        keywords=["my", "I should", "next step", "deadline", "help",
                  "understand", "priority", "task", "assigned"],
        anti_keywords=["KPI", "ROI", "portfolio", "strategic", "EBITDA"],
    ),
]

# Lookup by stable role key: "executive" | "manager" | "ic".
PERSONA_MAP = {p.role: p for p in PERSONAS}
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def get_persona(role: str) -> Persona:
    """Get a persona by role name.

    Raises:
        KeyError: if *role* is not a known role key ("executive",
            "manager", or "ic").
    """
    return PERSONA_MAP[role]
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def score_persona_alignment(answer: str, persona: "Persona") -> float:
    """Score how well an answer aligns with a persona's communication style.

    Returns a float in [0, 1] combining:
      - keyword density: saturates once ~30% of the persona keywords appear
      - formality: whether the text matches the persona's language style
      - anti-keyword penalty: 0.15 per off-persona term, capped at 0.5

    raw = 0.50 * keywords + 0.20 * formality + 0.30 - penalty, clamped to [0, 1].

    Fix vs. the previous version: removed the dead ``words``/``word_count``
    locals (computed via regex but never read).
    """
    answer_lower = answer.lower()

    # Keyword scoring: fraction of persona keywords found, scaled so that
    # hitting ~30% of them already yields a full keyword score.
    keyword_hits = sum(1 for kw in persona.keywords if kw.lower() in answer_lower)
    keyword_score = min(keyword_hits / max(len(persona.keywords) * 0.3, 1), 1.0)

    # Anti-keyword penalty (capped so it cannot erase the whole score).
    anti_hits = sum(1 for akw in persona.anti_keywords if akw.lower() in answer_lower)
    anti_penalty = min(anti_hits * 0.15, 0.5)

    # Formality check against the persona's expected register.
    formality_score = _check_formality(answer, persona.language_style)

    # Combine: 50% keywords, 20% formality, 30% base (minus anti-penalty).
    raw_score = 0.50 * keyword_score + 0.20 * formality_score + 0.30 - anti_penalty
    return round(max(0.0, min(1.0, raw_score)), 4)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def _check_formality(text: str, style: str) -> float:
|
| 84 |
+
"""Check if text formality matches the expected language style."""
|
| 85 |
+
text_lower = text.lower()
|
| 86 |
+
|
| 87 |
+
if style == "strategic-financial":
|
| 88 |
+
indicators = ["percent", "%", "million", "billion", "quarter", "fiscal",
|
| 89 |
+
"forecast", "benchmark", "metric"]
|
| 90 |
+
hits = sum(1 for ind in indicators if ind in text_lower)
|
| 91 |
+
return min(hits / 2.0, 1.0)
|
| 92 |
+
|
| 93 |
+
elif style == "operational-actionable":
|
| 94 |
+
indicators = ["action", "recommend", "should", "priority", "next steps",
|
| 95 |
+
"immediate", "plan", "schedule"]
|
| 96 |
+
hits = sum(1 for ind in indicators if ind in text_lower)
|
| 97 |
+
return min(hits / 2.0, 1.0)
|
| 98 |
+
|
| 99 |
+
elif style == "plain-personal":
|
| 100 |
+
# Plain style rewards shorter sentences and simple language
|
| 101 |
+
sentences = text.split(".")
|
| 102 |
+
avg_len = sum(len(s.split()) for s in sentences) / max(len(sentences), 1)
|
| 103 |
+
return 1.0 if avg_len < 20 else max(0.0, 1.0 - (avg_len - 20) / 30)
|
| 104 |
+
|
| 105 |
+
return 0.5
|
environments/shared/reward_utils.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Reward computation with cached downstream signals."""
|
| 2 |
+
|
| 3 |
+
# Cached downstream signals mapping quality buckets to historical scores.
|
| 4 |
+
# These represent how well downstream stages perform given upstream quality.
|
| 5 |
+
# Cached downstream signals mapping quality buckets to historical scores.
# These represent how well downstream stages perform given upstream quality;
# they act as a stand-in for live downstream evaluation when computing the
# reward mixes below. Bucket names come from _get_downstream_bucket.
DOWNSTREAM_CACHE: dict[str, float] = {
    "excellent": 0.95,  # DQ > 0.90 or coverage > 0.80
    "good": 0.75,       # DQ 0.70-0.90 or coverage 0.50-0.80
    "fair": 0.50,       # DQ 0.50-0.70 or coverage 0.30-0.50
    "poor": 0.20,       # DQ < 0.50 or coverage < 0.30
}
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _get_downstream_bucket(score: float) -> str:
|
| 14 |
+
"""Map a score to a downstream quality bucket."""
|
| 15 |
+
if score > 0.90:
|
| 16 |
+
return "excellent"
|
| 17 |
+
elif score > 0.70:
|
| 18 |
+
return "good"
|
| 19 |
+
elif score > 0.50:
|
| 20 |
+
return "fair"
|
| 21 |
+
return "poor"
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def cleaning_reward(dq_score: float, downstream_bucket: str = "") -> float:
    """Cleaning-stage reward: 0.70 * dq_score + 0.30 * downstream signal.

    When no bucket is supplied, the DQ score itself selects the bucket;
    unknown bucket names fall back to a neutral 0.5 downstream signal.
    """
    bucket = downstream_bucket or _get_downstream_bucket(dq_score)
    downstream = DOWNSTREAM_CACHE.get(bucket, 0.5)
    return round(0.70 * dq_score + 0.30 * downstream, 4)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def enrichment_reward(coverage: float, downstream_bucket: str = "") -> float:
    """Enrichment-stage reward: 0.50 * coverage + 0.50 * downstream signal.

    When no bucket is supplied, coverage itself selects the bucket; unknown
    bucket names fall back to a neutral 0.5 downstream signal.
    """
    bucket = downstream_bucket or _get_downstream_bucket(coverage)
    downstream = DOWNSTREAM_CACHE.get(bucket, 0.5)
    return round(0.50 * coverage + 0.50 * downstream, 4)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def answering_reward(faithfulness: float, persona_relevance: float,
|
| 47 |
+
patronus_score: float | None = None) -> float:
|
| 48 |
+
"""Compute answering stage reward.
|
| 49 |
+
|
| 50 |
+
Without Patronus: 0.30 * faithfulness + 0.70 * persona_relevance
|
| 51 |
+
With Patronus: 0.40 * patronus_faithfulness + 0.60 * persona_relevance
|
| 52 |
+
"""
|
| 53 |
+
if patronus_score is not None:
|
| 54 |
+
return round(0.40 * patronus_score + 0.60 * persona_relevance, 4)
|
| 55 |
+
return round(0.30 * faithfulness + 0.70 * persona_relevance, 4)
|
models.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Data models for the DataSage Cleaning Environment."""
|
| 2 |
+
|
| 3 |
+
from openenv.core.env_server.types import Action, Observation
|
| 4 |
+
from pydantic import Field
|
| 5 |
+
from typing import Optional
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class CleaningAction(Action):
    """Action for the Cleaning environment - a single data cleaning operation.

    One action applies one operation to (at most) one column of the current
    batch. The environment silently ignores unknown operations and invalid
    columns, so a malformed action is a no-op rather than an error.
    """

    # Which transformation to apply; semantics live in CleaningEnvironment.
    operation: str = Field(
        ...,
        description="Cleaning operation: fill_null|fix_type|remove_duplicate|standardize|trim|correct_typo",
    )
    # Target column (remove_duplicate operates on whole rows and ignores it).
    column: str = Field(..., description="Target column name")
    # Operation argument: "median"/"mode" or a literal for fill_null,
    # "lower"/"title" for standardize, the corrected value for correct_typo.
    value: Optional[str] = Field(
        None,
        description="Replacement value or rule (e.g., 'median', 'mode', a specific value)",
    )
    # Extra operation parameters, e.g. {"wrong": "<typo>"} for correct_typo.
    params: dict = Field(default_factory=dict)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class CleaningObservation(Observation):
    """Observation from the Cleaning environment - the data quality state.

    Returned by both reset() and step(); text fields are pre-rendered on the
    server so agents can consume them directly as prompt context.
    """

    # Active domain for this episode.
    domain: str = Field(default="", description="Current domain: hr|sales|pm|it_ops")
    # Human-readable table of the first rows of the (possibly cleaned) batch.
    data_preview: str = Field(default="", description="First 5 rows as text table")
    # Per-metric breakdown matching dq_score's components.
    dq_report: str = Field(
        default="",
        description="Completeness, consistency, uniqueness breakdown",
    )
    # Weighted overall data-quality score.
    dq_score: float = Field(default=0.0, description="Overall DQ score 0-1")
    # One line per column: dtype, null count, expected/extra tag.
    columns_info: str = Field(
        default="", description="Column names, types, null counts"
    )
    # Progress within the episode (episode ends at max_steps or DQ > 0.95).
    step_number: int = Field(default=0)
    max_steps: int = Field(default=15)
|
openenv.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: datasage_cleaning
|
| 3 |
+
type: space
|
| 4 |
+
runtime: fastapi
|
| 5 |
+
app: server.app:app
|
| 6 |
+
port: 8000
|
pyproject.toml
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=45", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "openenv-datasage-cleaning"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "DataSage Cleaning environment for OpenEnv"
|
| 9 |
+
requires-python = ">=3.10"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"openenv-core[core]>=0.2.1",
|
| 12 |
+
"pandas>=2.0",
|
| 13 |
+
"numpy>=1.24",
|
| 14 |
+
]
|
| 15 |
+
|
| 16 |
+
[project.optional-dependencies]
|
| 17 |
+
dev = [
|
| 18 |
+
"pytest>=8.0.0",
|
| 19 |
+
"pytest-cov>=4.0.0",
|
| 20 |
+
]
|
| 21 |
+
|
| 22 |
+
[project.scripts]
|
| 23 |
+
server = "datasage_cleaning.server.app:main"
|
| 24 |
+
|
| 25 |
+
[tool.setuptools]
|
| 26 |
+
include-package-data = true
|
| 27 |
+
packages = ["datasage_cleaning", "datasage_cleaning.server"]
|
| 28 |
+
package-dir = { "datasage_cleaning" = ".", "datasage_cleaning.server" = "server" }
|
server/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Cleaning environment server components."""
|
| 2 |
+
|
| 3 |
+
from .cleaning_environment import CleaningEnvironment
|
| 4 |
+
|
| 5 |
+
__all__ = ["CleaningEnvironment"]
|
server/app.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI application for the DataSage Cleaning Environment.
|
| 3 |
+
|
| 4 |
+
Endpoints:
|
| 5 |
+
- POST /reset: Reset the environment
|
| 6 |
+
- POST /step: Execute an action
|
| 7 |
+
- GET /state: Get current environment state
|
| 8 |
+
- GET /schema: Get action/observation schemas
|
| 9 |
+
- WS /ws: WebSocket endpoint for persistent sessions
|
| 10 |
+
|
| 11 |
+
Usage:
|
| 12 |
+
uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
# Fail fast with an actionable message when the OpenEnv runtime is missing.
try:
    from openenv.core.env_server.http_server import create_app
except Exception as e:  # pragma: no cover
    raise ImportError(
        "openenv is required. Install with: uv sync"
    ) from e

from models import CleaningAction, CleaningObservation

from .cleaning_environment import CleaningEnvironment


# create_app wires the standard OpenEnv HTTP/WebSocket endpoints listed in
# the module docstring around the environment, action, and observation types.
app = create_app(
    CleaningEnvironment,
    CleaningAction,
    CleaningObservation,
    env_name="datasage_cleaning",
    max_concurrent_envs=4,  # upper bound on simultaneous environment sessions
)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def main(host: str = "0.0.0.0", port: int = 8000):
    """Entry point for direct execution.

    Serves the module-level FastAPI ``app`` with uvicorn on *host*:*port*.
    """
    import uvicorn

    uvicorn.run(app, host=host, port=port)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
if __name__ == "__main__":
|
| 43 |
+
import argparse
|
| 44 |
+
|
| 45 |
+
parser = argparse.ArgumentParser()
|
| 46 |
+
parser.add_argument("--port", type=int, default=8000)
|
| 47 |
+
args = parser.parse_args()
|
| 48 |
+
main(port=args.port)
|
server/cleaning_environment.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
DataSage Cleaning Environment Implementation.
|
| 3 |
+
|
| 4 |
+
An RL environment where the agent must clean corrupted enterprise data
|
| 5 |
+
across 4 domains (HR, Sales, PM, IT Ops) by applying cleaning operations
|
| 6 |
+
to maximise the data quality score.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import random
|
| 10 |
+
import sys
|
| 11 |
+
import os
|
| 12 |
+
|
| 13 |
+
from uuid import uuid4
|
| 14 |
+
|
| 15 |
+
import numpy as np
|
| 16 |
+
import pandas as pd
|
| 17 |
+
|
| 18 |
+
# Allow imports from the project root so shared modules are reachable.
|
| 19 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', '..'))
|
| 20 |
+
|
| 21 |
+
from environments.shared.domains import DOMAINS
|
| 22 |
+
from environments.shared.enterprise_data import (
|
| 23 |
+
load_domain_data,
|
| 24 |
+
inject_corruption,
|
| 25 |
+
compute_dq_score,
|
| 26 |
+
format_preview,
|
| 27 |
+
format_columns_info,
|
| 28 |
+
)
|
| 29 |
+
from environments.shared.reward_utils import cleaning_reward
|
| 30 |
+
|
| 31 |
+
from models import CleaningAction, CleaningObservation
|
| 32 |
+
from openenv.core.env_server.interfaces import Environment
|
| 33 |
+
from openenv.core.env_server.types import State
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class CleaningEnvironment(Environment):
    """
    Cleaning environment: the agent fixes data quality issues in a
    50-row enterprise data batch.

    Each episode picks a random domain, loads a 50-row batch, and injects
    corruption; the agent then applies cleaning operations until the DQ
    score exceeds 0.95 or the step budget runs out.

    Supported operations:
        fill_null        - fill missing values (median / mode / explicit value)
        fix_type         - cast a column to numeric, coercing errors to NaN
        remove_duplicate - drop duplicate rows
        standardize      - strip whitespace and normalise case (lower / title)
        trim             - strip leading/trailing whitespace
        correct_typo     - replace a typo with a correct value
    """

    # Each session gets its own instance, so concurrent sessions are allowed.
    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self):
        """Initialise the cleaning environment with an empty batch.

        reset() must be called before step(); until then the frame is empty
        and no domain is selected.
        """
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._df: pd.DataFrame = pd.DataFrame()   # current (corrupted) batch
        self._domain_name: str = ""               # "hr" | "sales" | "pm" | "it_ops"
        self._domain_config = None                # DomainConfig for the active domain
        self._max_steps: int = 15                 # per-episode step budget

    # ------------------------------------------------------------------
    # reset
    # ------------------------------------------------------------------
    def reset(self) -> CleaningObservation:
        """Pick a random domain, load a 50-row batch, inject corruption.

        Returns the initial observation (reward 0, done False) describing
        the corrupted batch the agent must clean.
        """
        self._state = State(episode_id=str(uuid4()), step_count=0)

        self._domain_name = random.choice(list(DOMAINS.keys()))
        self._domain_config = DOMAINS[self._domain_name]

        # Load raw data and sample 50 rows, then corrupt ~15% of it.
        raw_df = load_domain_data(self._domain_name, sample_size=50)
        self._df = inject_corruption(raw_df, self._domain_config, rate=0.15)

        dq = compute_dq_score(self._df, self._domain_config)
        dq_report = (
            f"completeness={dq['completeness']:.4f} "
            f"consistency={dq['consistency']:.4f} "
            f"uniqueness={dq['uniqueness']:.4f}"
        )

        return CleaningObservation(
            domain=self._domain_name,
            data_preview=format_preview(self._df),
            dq_report=dq_report,
            dq_score=dq["overall"],
            columns_info=format_columns_info(self._df, self._domain_config),
            step_number=0,
            max_steps=self._max_steps,
            done=False,
            reward=0.0,
        )

    # ------------------------------------------------------------------
    # step
    # ------------------------------------------------------------------
    def step(self, action: CleaningAction) -> CleaningObservation:  # type: ignore[override]
        """Apply a single cleaning operation and return the updated state.

        Invalid operations/columns are deliberately no-ops (the step still
        counts against the budget). The episode ends when the overall DQ
        score exceeds 0.95 or max_steps is reached.
        """
        self._state.step_count += 1
        step = self._state.step_count

        op = action.operation
        col = action.column
        value = action.value

        # ---- apply operation ----
        try:
            if op == "fill_null":
                self._apply_fill_null(col, value)
            elif op == "fix_type":
                self._apply_fix_type(col)
            elif op == "remove_duplicate":
                self._apply_remove_duplicate()
            elif op == "standardize":
                self._apply_standardize(col, value)
            elif op == "trim":
                self._apply_trim(col)
            elif op == "correct_typo":
                self._apply_correct_typo(col, value, action.params)
            # unknown ops are silently ignored (no crash)
        except Exception:
            pass  # invalid column, etc. -> no-op

        # ---- compute DQ and reward ----
        dq = compute_dq_score(self._df, self._domain_config)
        reward = cleaning_reward(dq["overall"])
        done = dq["overall"] > 0.95 or step >= self._max_steps

        dq_report = (
            f"completeness={dq['completeness']:.4f} "
            f"consistency={dq['consistency']:.4f} "
            f"uniqueness={dq['uniqueness']:.4f}"
        )

        return CleaningObservation(
            domain=self._domain_name,
            data_preview=format_preview(self._df),
            dq_report=dq_report,
            dq_score=dq["overall"],
            columns_info=format_columns_info(self._df, self._domain_config),
            step_number=step,
            max_steps=self._max_steps,
            done=done,
            reward=reward,
            metadata={
                "operation": op,
                "column": col,
                "step": step,
            },
        )

    # ------------------------------------------------------------------
    # state property
    # ------------------------------------------------------------------
    @property
    def state(self) -> State:
        """Return current environment state (episode id + step count)."""
        return self._state

    # ------------------------------------------------------------------
    # operation helpers
    # ------------------------------------------------------------------
    def _apply_fill_null(self, col: str, value: str | None) -> None:
        # Fill NaNs with the numeric median, the most frequent value, the
        # literal value given, or "" when no value was supplied.
        if col not in self._df.columns:
            return
        if value == "median":
            # Coerce first so string sentinels ("N/A", ...) don't break median.
            numeric = pd.to_numeric(self._df[col], errors="coerce")
            fill_val = numeric.median()
        elif value == "mode":
            mode_vals = self._df[col].mode()
            fill_val = mode_vals.iloc[0] if len(mode_vals) > 0 else ""
        else:
            fill_val = value if value is not None else ""
        self._df[col] = self._df[col].fillna(fill_val)

    def _apply_fix_type(self, col: str) -> None:
        # Cast to numeric; unparseable values become NaN (fixable by fill_null).
        if col not in self._df.columns:
            return
        self._df[col] = pd.to_numeric(self._df[col], errors="coerce")

    def _apply_remove_duplicate(self) -> None:
        # Key on the first five expected columns, matching compute_dq_score's
        # uniqueness metric, so this operation directly improves that score.
        available = [c for c in self._domain_config.columns if c in self._df.columns]
        self._df = self._df.drop_duplicates(
            subset=available[:5], keep="first"
        ).reset_index(drop=True)

    def _apply_standardize(self, col: str, value: str | None) -> None:
        # Always trims; additionally normalises case when value is
        # "lower" or "title". Note: casts the column to string dtype.
        if col not in self._df.columns:
            return
        self._df[col] = self._df[col].astype(str).str.strip()
        if value == "lower":
            self._df[col] = self._df[col].str.lower()
        elif value == "title":
            self._df[col] = self._df[col].str.title()

    def _apply_trim(self, col: str) -> None:
        # Strip leading/trailing whitespace (casts the column to string dtype).
        if col not in self._df.columns:
            return
        self._df[col] = self._df[col].astype(str).str.strip()

    def _apply_correct_typo(self, col: str, value: str | None,
                            params: dict) -> None:
        # Replace params["wrong"] with value; without a "wrong" hint, assume
        # the least common value in the column is the typo.
        if col not in self._df.columns or value is None:
            return
        wrong = params.get("wrong")
        if wrong:
            self._df[col] = self._df[col].replace(wrong, value)
        else:
            # If no specific wrong value given, try to replace the most
            # uncommon value with the provided correct value.
            counts = self._df[col].value_counts()
            if len(counts) > 1:
                least_common = counts.index[-1]
                self._df[col] = self._df[col].replace(least_common, value)
|
server/requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv[core]>=0.2.0
|
| 2 |
+
fastapi>=0.115.0
|
| 3 |
+
uvicorn>=0.24.0
|
| 4 |
+
pandas>=2.0
|
| 5 |
+
numpy>=1.24
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|