Spaces:

learneng-space
/

idioms

Sleeping

File size: 1,417 Bytes

f232d53
7ec4963
d7ab29f
67a207f
d7ab29f
 
67a207f
f232d53
 
 
7ec4963
67a207f
d7ab29f
 
770e3a9
d7ab29f
 
67a207f
7ec4963
 
d7ab29f
67a207f
20c79ed
67a207f
d7ab29f
 
 
 
 
 
 
 
 
 
 
67a207f
d7ab29f
 
 
67a207f
20c79ed
39cbc9d

import os

# Point the Hugging Face cache at a directory inside the app image.
# This must run BEFORE `datasets` is imported: the Hugging Face libraries
# resolve their cache paths from HF_HOME at import time, so assigning the
# variable after the import does not change where data is cached.
# `setdefault` keeps this a true fallback: an HF_HOME supplied by the
# deployment environment wins, otherwise /app/cache is used.
os.environ.setdefault("HF_HOME", "/app/cache")

import json  # noqa: E402
import random  # noqa: E402
import re  # noqa: E402

from datasets import load_dataset  # noqa: E402
from fastapi import FastAPI  # noqa: E402

app = FastAPI()

# Load the dataset from Hugging Face once at startup, keeping only the
# columns the API needs (sentence_type is required for the filter below).
dataset = load_dataset("UCSC-Admire/idiom-SFT-dataset-561-2024-12-06_00-40-30", split="train").select_columns(["compound", "sentence", "output", "sentence_type"])

# Keep only the records whose sentence_type is "idiomatic".
dataset = dataset.filter(lambda x: x["sentence_type"] == "idiomatic")

@app.get("/api/idioms")
async def get_idioms():
    """Return up to 50 randomly chosen records from the filtered dataset.

    Returns:
        A list of dicts, one per sampled record, with the keys:

        * ``idiom`` - the record's ``compound`` column.
        * ``example`` - the record's ``sentence`` column.
        * ``definition`` - the ``"Compound Meaning"`` field parsed from the
          record's ``output`` column, or ``""`` when that column is empty,
          is not valid JSON, or does not decode to a JSON object.
    """
    # Sample row indices rather than `list(dataset)`: this avoids decoding
    # the whole dataset on every request. Capping the sample size prevents
    # random.sample from raising ValueError when fewer than 50 rows exist.
    total = len(dataset)
    picked = random.sample(range(total), min(50, total))

    response = []
    for index in picked:
        item = dataset[index]

        # The column may hold None; treat that the same as an empty string.
        raw_output = item.get("output") or ""
        # Strip the <output>...</output> wrapper tags around the JSON payload.
        cleaned_output = re.sub(r"</?output>", "", raw_output).strip()

        try:
            output_json = json.loads(cleaned_output)
        except json.JSONDecodeError:
            output_json = None  # Invalid JSON: fall back to no definition.

        # Valid JSON is not necessarily an object (it may be a list, string,
        # number, ...); only a dict can carry the 'Compound Meaning' key.
        if isinstance(output_json, dict):
            compound_meaning = output_json.get("Compound Meaning", "")
        else:
            compound_meaning = ""

        response.append({
            "idiom": item.get("compound", ""),
            "example": item.get("sentence", ""),
            "definition": compound_meaning,
        })

    return response