"""Streamlit app: extract SKILL / KNOWLEDGE lists from free text with Phi-2."""

import ast
import json
import os
import re

import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Phi-2 is an open model: no HF auth token and no sentencepiece required.
model_id = "microsoft/phi-2"

tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Prefer bfloat16; fall back to float16 on hardware without bf16 support.
try:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
except Exception:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )

gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tok,
    max_new_tokens=256,
    do_sample=False,
    return_full_text=False,
)

# BUG FIX: the literal JSON braces in the instruction must be doubled
# ({{ ... }}), otherwise str.format() treats {"SKILL":...} as a replacement
# field and raises KeyError: '"SKILL"' before the model is ever called.
# Only {text} is a real placeholder.
prompt = """Extract skills and knowledge from the text. Return JSON: {{"SKILL":[...], "KNOWLEDGE":[...]}}. Text: {text} JSON:"""


def extract(text: str) -> dict:
    """Run the model on *text* and return {"SKILL": [...], "KNOWLEDGE": [...]}.

    If no parseable JSON object can be found in the generation, returns an
    error payload that includes the raw model output under
    "DEBUG_RAW_OUTPUT" so the failure can be inspected instead of crashing.
    """
    out = gen(prompt.format(text=text))
    # Pipeline output key varies by version; fall back progressively.
    raw = out[0].get("generated_text") or out[0].get("text") or str(out[0])

    data = {}
    m = re.search(r"(\{[\s\S]*\})", raw)
    if m:
        blob = m.group(1).strip()
        # Try strict JSON first, then Python-literal syntax (handles the
        # single-quoted dicts small models often emit).
        for parser in (json.loads, ast.literal_eval):
            try:
                parsed = parser(blob)
            except Exception:
                continue
            if isinstance(parsed, list) and parsed:
                data = parsed[0]
            elif isinstance(parsed, dict):
                data = parsed
            break

    if not isinstance(data, dict):
        # Model emitted something that parsed but is not a JSON object:
        # surface the raw output for debugging rather than raising.
        return {
            "SKILL": ["(Error: Invalid/Corrupted Model Output)"],
            "KNOWLEDGE": [],
            "DEBUG_RAW_OUTPUT": raw,
        }
    return {
        "SKILL": data.get("SKILL", []),
        "KNOWLEDGE": data.get("KNOWLEDGE", []),
    }


st.title("Skill/Knowledge Extractor")
text = st.text_area("Paste text")
if st.button("Extract") and text.strip():
    st.json(extract(text))