import os
from fastapi import FastAPI
from datasets import load_dataset
import random
import json
import re

# Point the Hugging Face cache at a directory inside the app image, but only
# when the deployment has not already chosen one, so this is a true fallback
# rather than an override of an externally configured HF_HOME.
# NOTE(review): this runs after `datasets` has been imported at the top of the
# file; the Hugging Face libraries appear to resolve HF_HOME at import time, so
# this may not affect the cache paths used by this process. Prefer setting
# HF_HOME in the container/runtime environment — confirm.
os.environ.setdefault("HF_HOME", "/app/cache")

app = FastAPI()

# Load the train split from the Hugging Face Hub and keep only the columns the
# API needs; sentence_type is retained because the filter below reads it.
dataset = load_dataset(
    "UCSC-Admire/idiom-SFT-dataset-561-2024-12-06_00-40-30",
    split="train",
).select_columns(["compound", "sentence", "output", "sentence_type"])

# Keep only the records whose sentence_type is "idiomatic".
dataset = dataset.filter(lambda x: x["sentence_type"] == "idiomatic")

# Compiled once at import time instead of being looked up on every row.
_OUTPUT_TAG_RE = re.compile(r"</?output>")

# Upper bound on the number of idioms returned per request.
_IDIOM_SAMPLE_SIZE = 50


def _extract_compound_meaning(raw_output):
    """Return the 'Compound Meaning' value embedded in a record's output field.

    The field is expected to hold a JSON object, optionally wrapped in
    <output>...</output> tags. An empty string is returned when the field is
    not a string, is not valid JSON, or decodes to something other than a
    JSON object, so a single malformed record cannot fail the whole request.
    """
    if not isinstance(raw_output, str):
        return ""

    cleaned_output = _OUTPUT_TAG_RE.sub("", raw_output).strip()
    try:
        parsed = json.loads(cleaned_output)
    except json.JSONDecodeError:
        return ""

    # Valid JSON is not necessarily an object (it may be a list, string, ...).
    if not isinstance(parsed, dict):
        return ""
    return parsed.get("Compound Meaning", "")


@app.get("/api/idioms")
async def get_idioms():
    """Return up to 50 randomly chosen idiom records.

    Each element of the returned list is a dict with the keys "idiom",
    "example" and "definition", taken from the record's "compound" and
    "sentence" columns and from the 'Compound Meaning' entry of its "output"
    column respectively. Fewer than 50 items (possibly none) are returned when
    the filtered dataset holds fewer records.
    """
    total = len(dataset)
    # Sample row indices rather than rows: this avoids copying the whole
    # dataset into a list on every request, and capping the sample size keeps
    # random.sample from raising ValueError on a small dataset.
    chosen = random.sample(range(total), min(_IDIOM_SAMPLE_SIZE, total))

    response = []
    for index in chosen:
        item = dataset[index]
        response.append({
            "idiom": item.get("compound", ""),
            "example": item.get("sentence", ""),
            "definition": _extract_compound_meaning(item.get("output", "")),
        })

    return response