"""Streamlit app: extract SKILL / KNOWLEDGE lists from free text with Phi-2."""

import ast
import json
import os
import re

import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Phi-2 is an open model: no HF auth token and no sentencepiece required.
model_id = "microsoft/phi-2"

tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Prefer bfloat16; fall back to float16 on hardware without bf16 support.
try:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
except Exception:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )

gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tok,
    max_new_tokens=256,
    do_sample=False,
    return_full_text=False,
)

# BUG FIX: the literal JSON braces in the instruction must be doubled
# ({{ ... }}), otherwise str.format() treats {"SKILL":...} as a replacement
# field and raises KeyError: '"SKILL"' before the model is ever called.
# Only {text} is a real placeholder.
prompt = """Extract skills and knowledge from the text. Return JSON: {{"SKILL":[...], "KNOWLEDGE":[...]}}. Text: {text} JSON:"""


def extract(text: str) -> dict:
    """Run the model on *text* and return {"SKILL": [...], "KNOWLEDGE": [...]}.

    If no parseable JSON object can be found in the generation, returns an
    error payload that includes the raw model output under
    "DEBUG_RAW_OUTPUT" so the failure can be inspected instead of crashing.
    """
    out = gen(prompt.format(text=text))
    # Pipeline output key varies by version; fall back progressively.
    raw = out[0].get("generated_text") or out[0].get("text") or str(out[0])

    data = {}
    m = re.search(r"(\{[\s\S]*\})", raw)
    if m:
        blob = m.group(1).strip()
        # Try strict JSON first, then Python-literal syntax (handles the
        # single-quoted dicts small models often emit).
        for parser in (json.loads, ast.literal_eval):
            try:
                parsed = parser(blob)
            except Exception:
                continue
            if isinstance(parsed, list) and parsed:
                data = parsed[0]
            elif isinstance(parsed, dict):
                data = parsed
            break

    if not isinstance(data, dict):
        # Model emitted something that parsed but is not a JSON object:
        # surface the raw output for debugging rather than raising.
        return {
            "SKILL": ["(Error: Invalid/Corrupted Model Output)"],
            "KNOWLEDGE": [],
            "DEBUG_RAW_OUTPUT": raw,
        }
    return {
        "SKILL": data.get("SKILL", []),
        "KNOWLEDGE": data.get("KNOWLEDGE", []),
    }


st.title("Skill/Knowledge Extractor")
text = st.text_area("Paste text")
if st.button("Extract") and text.strip():
    st.json(extract(text))