"""FastAPI service that serves random idiomatic examples from a Hugging Face dataset."""

import json
import os
import random
import re

# Set the Hugging Face cache directory as a fallback. This must run BEFORE
# `datasets` is imported: the library resolves its cache paths from HF_HOME at
# import time, so assigning the variable afterwards has no effect on it.
# `setdefault` keeps any value already provided by the deployment environment.
os.environ.setdefault("HF_HOME", "/app/cache")

from datasets import load_dataset  # noqa: E402  (must follow the HF_HOME setup)
from fastapi import FastAPI  # noqa: E402

# Maximum number of idioms returned by a single /api/idioms call.
SAMPLE_SIZE = 50

app = FastAPI()

# Load the dataset from Hugging Face, keeping only the columns the API needs
# (sentence_type is required for the filter below).
dataset = load_dataset(
    "UCSC-Admire/idiom-SFT-dataset-561-2024-12-06_00-40-30",
    split="train",
).select_columns(["compound", "sentence", "output", "sentence_type"])

# Keep only records whose sentence_type is "idiomatic".
dataset = dataset.filter(lambda x: x["sentence_type"] == "idiomatic")


def _extract_compound_meaning(raw_output):
    """Return the 'Compound Meaning' value embedded in a record's output text.

    The text is parsed as JSON. If it is not valid JSON as a whole (for
    example because the payload is wrapped in tags or other markup), the
    outermost ``{...}`` span is parsed instead. An empty string is returned
    when the input is not a string, no JSON object can be parsed, or the
    parsed value is not a JSON object.
    """
    if not isinstance(raw_output, str):
        return ""

    text = raw_output.strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # The previous tag-stripping regex had an empty pattern and removed
        # nothing; isolating the outermost braces copes with any wrapper.
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end < start:
            return ""
        try:
            parsed = json.loads(text[start:end + 1])
        except json.JSONDecodeError:
            return ""

    # json.loads may legitimately return a list, string or number.
    if not isinstance(parsed, dict):
        return ""
    return parsed.get("Compound Meaning", "")


@app.get("/api/idioms")
async def get_idioms():
    """Return up to ``SAMPLE_SIZE`` randomly chosen idiomatic records.

    Each element of the returned list is a dict with the keys ``idiom``
    (the record's ``compound``), ``example`` (its ``sentence``) and
    ``definition`` (the 'Compound Meaning' parsed from its ``output``, or
    an empty string when that cannot be extracted).
    """
    total = len(dataset)
    # Sample row indices rather than rows: this avoids materialising the whole
    # dataset on every request, and capping k prevents random.sample from
    # raising ValueError when fewer than SAMPLE_SIZE records are available.
    chosen = random.sample(range(total), min(SAMPLE_SIZE, total))

    return [
        {
            "idiom": item.get("compound", ""),
            "example": item.get("sentence", ""),
            "definition": _extract_compound_meaning(item.get("output", "")),
        }
        for item in (dataset[index] for index in chosen)
    ]