# Streamlit app: extract SKILL / KNOWLEDGE lists from free text with microsoft/phi-2.
# (Hugging Face Space file-viewer artifacts removed from this header.)
import json, re, ast, streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
import os
# Model: microsoft/phi-2 — its tokenizer does not require sentencepiece,
# and the checkpoint is public, so no HF access token is needed.
model_id = "microsoft/phi-2"

tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)


def _load_model(dtype):
    """Load Phi-2 with the given torch dtype, letting HF place it on devices."""
    return AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=dtype,
        device_map="auto",
        trust_remote_code=True,
    )


# Prefer bfloat16; fall back to float16 on hardware/builds without bf16 support.
# A second failure propagates, same as before.
try:
    model = _load_model(torch.bfloat16)
except Exception:
    model = _load_model(torch.float16)

# Deterministic generation (greedy, no sampling); return only the completion,
# not the echoed prompt.
gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tok,
    max_new_tokens=256,
    do_sample=False,
    return_full_text=False,
)
# Prompt template rendered with str.format() in extract().
# BUG FIX: the literal JSON braces must be doubled ({{ }}) — with single
# braces, str.format() parses {"SKILL":[...]...} as a replacement field and
# raises KeyError: '"SKILL"' before the model is ever called. After
# formatting, the doubled braces collapse back to the intended single braces.
prompt = """Extract skills and knowledge from the text.
Return JSON: {{"SKILL":[...], "KNOWLEDGE":[...]}}.
Text: {text}
JSON:"""
def extract(text: str):
    """Run the generation pipeline on *text* and parse the model's JSON reply.

    Returns a dict with "SKILL" and "KNOWLEDGE" lists. If the model's output
    cannot be interpreted as a dict, returns placeholder values plus the raw
    output under "DEBUG_RAW_OUTPUT" for inspection.
    """
    result = gen(prompt.format(text=text))
    first = result[0]
    # Pipeline output key varies by version; fall back to a plain repr.
    raw = first.get("generated_text") or first.get("text") or str(first)

    parsed = {}
    match = re.search(r"(\{[\s\S]*\})", raw)
    if match:
        candidate = match.group(0).strip()
        # Try strict JSON first, then Python-literal syntax (single quotes, etc.).
        for loader in (json.loads, ast.literal_eval):
            try:
                value = loader(candidate)
            except Exception:
                continue
            if isinstance(value, list) and value:
                parsed = value[0]
            elif isinstance(value, dict):
                parsed = value
            break

    if not isinstance(parsed, dict):
        # Model produced something we couldn't interpret as a dict
        # (e.g. a list of strings) — surface the raw output for debugging.
        return {
            "SKILL": ["(Error: Invalid/Corrupted Model Output)"],
            "KNOWLEDGE": [],
            "DEBUG_RAW_OUTPUT": raw,
        }
    return {
        "SKILL": parsed.get("SKILL", []),
        "KNOWLEDGE": parsed.get("KNOWLEDGE", []),
    }
# --- Streamlit UI ---
# BUG FIX: the original last line ended with a stray " |" (file-viewer scrape
# artifact), which is a SyntaxError; it has been removed.
st.title("Skill/Knowledge Extractor")
text = st.text_area("Paste text")
# Only run extraction on a click with non-blank input.
if st.button("Extract") and text.strip():
    st.json(extract(text))