File size: 2,406 Bytes
3dc09f6
0d6e70e
da762f1
cffdf8b
3dc09f6
cffdf8b
 
9fbf203
cffdf8b
 
 
9fbf203
cffdf8b
0d6e70e
 
 
da762f1
45fc6e6
cffdf8b
 
0d6e70e
 
 
 
 
45fc6e6
cffdf8b
 
0d6e70e
9fbf203
cffdf8b
3dc09f6
 
 
 
 
 
 
 
 
 
 
fd1507d
3dc09f6
9fbf203
3dc09f6
fd1507d
3dc09f6
 
fd1507d
 
 
 
 
 
 
 
 
3dc09f6
cffdf8b
 
3dc09f6
 
 
 
 
fd1507d
3dc09f6
 
 
 
 
 
 
fd1507d
3dc09f6
0d6e70e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import json, re, ast, streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch 
import os 

# Model choice: microsoft/phi-2 — a small open model whose tokenizer does not
# require sentencepiece and whose repo needs trust_remote_code=True.
model_id = "microsoft/phi-2"

# Phi-2 is an ungated model, so no Hugging Face auth token is needed.
# HF_TOKEN = os.environ.get("HF_TOKEN") # Removed
tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 

# Load weights in bfloat16 first; fall back to float16 for hardware/builds
# where bfloat16 is unsupported. device_map="auto" lets accelerate place the
# model on GPU/CPU automatically.
try:
    model = AutoModelForCausalLM.from_pretrained(
        model_id, 
        torch_dtype=torch.bfloat16, 
        device_map="auto",
        trust_remote_code=True
        # token=HF_TOKEN # Removed
    )
except Exception:
    model = AutoModelForCausalLM.from_pretrained(
        model_id, 
        torch_dtype=torch.float16, 
        device_map="auto",
        trust_remote_code=True
        # token=HF_TOKEN # Removed
    )

# Greedy decoding (do_sample=False) for deterministic extraction;
# return_full_text=False strips the prompt from the generated output.
gen = pipeline("text-generation", model=model, tokenizer=tok,
               max_new_tokens=256, do_sample=False, return_full_text=False)

# Prompt template filled via str.format. Literal JSON braces must be doubled
# ("{{" / "}}") because str.format treats every single "{" as a replacement
# field — with single braces, prompt.format(text=...) raises
# KeyError: '"SKILL"' before the model is ever called.
prompt = """Extract skills and knowledge from the text.
Return JSON: {{"SKILL":[...], "KNOWLEDGE":[...]}}.
Text: {text}
JSON:"""

def extract(text: str):
    out = gen(prompt.format(text=text))
    raw = out[0].get("generated_text") or out[0].get("text") or str(out[0])
    m = re.search(r"(\{[\s\S]*\})", raw)
    data = {}
    
    if m:
        blob = m.group(0).strip()
        for parser in (json.loads, ast.literal_eval):
            try:
                parsed_data = parser(blob)
                if isinstance(parsed_data, list) and parsed_data:
                    data = parsed_data[0]
                elif isinstance(parsed_data, dict):
                    data = parsed_data
                break
            except Exception:
                continue

    if not isinstance(data, dict):
        # NOTE: You are now hitting a KeyError: "SKILL" (image_36e619.png). 
        # This is because the model returned bad JSON. This is the code that handles it:
        return {
            "SKILL": ["(Error: Invalid/Corrupted Model Output)"],
            "KNOWLEDGE": [],
            "DEBUG_RAW_OUTPUT": raw
        }

    return {
        "SKILL": data.get("SKILL", []),
        "KNOWLEDGE": data.get("KNOWLEDGE", [])
    }

# --- Streamlit UI: paste text, click Extract, render the result as JSON ---
st.title("Skill/Knowledge Extractor")
user_input = st.text_area("Paste text")

if st.button("Extract"):
    # Skip extraction when the box is empty or whitespace-only.
    if user_input.strip():
        st.json(extract(user_input))