File size: 1,253 Bytes
4a92139
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import json, re, ast, streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_id = "mistralai/Mistral-7B-Instruct-v0.3"


@st.cache_resource
def _load_components(name: str):
    """Build the tokenizer, model, and text-generation pipeline for *name*.

    Streamlit re-executes the whole script on every widget interaction, so
    loading at module level would rebuild the model on each button click.
    ``st.cache_resource`` keeps one shared copy for the server process.

    Args:
        name: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model, pipeline)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(name)
    lm = AutoModelForCausalLM.from_pretrained(
        name, torch_dtype="auto", device_map="auto"
    )
    generator = pipeline(
        "text-generation",
        model=lm,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=False,
        # Only the completion is returned, not the prompt echoed back.
        return_full_text=False,
    )
    return tokenizer, lm, generator


# Keep the original module-level names so the rest of the script is unchanged.
tok, model, gen = _load_components(model_id)

# Instruction template sent to the model; "{text}" marks where the user's
# text goes. NOTE: the JSON example below contains unescaped braces, so
# calling str.format() on this template treats it as a replacement field
# and raises KeyError.
prompt = (
    "Extract skills and knowledge from the text.\n"
    "\n"
    'Return JSON: {"SKILL":[...], "KNOWLEDGE":[...]}.\n'
    "\n"
    "Text: {text}\n"
    "\n"
    "JSON:"
)

def _parse_reply(raw: str) -> dict:
    """Return the first dict embedded in a model reply, or ``{}`` if none parses."""
    span = re.search(r"\{[\s\S]*\}", raw)
    if span is None:
        return {}
    blob = span.group(0)
    # Strict JSON first, then Python-literal syntax for single-quoted replies.
    for parser in (json.loads, ast.literal_eval):
        try:
            parsed = parser(blob)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue
        if isinstance(parsed, dict):
            return parsed
    # The greedy span runs to the LAST "}", so any braces the model emits
    # after the object break both parsers above. Retry from each "{" and let
    # the JSON decoder decide where the object ends.
    decoder = json.JSONDecoder()
    start = raw.find("{")
    while start != -1:
        try:
            parsed, _ = decoder.raw_decode(raw, start)
        except ValueError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed
        start = raw.find("{", start + 1)
    return {}


def _as_list(value) -> list:
    """Coerce one parsed field to a list so callers always get a uniform shape."""
    if value is None:
        return []
    if isinstance(value, (list, tuple, set)):
        return list(value)
    # A bare scalar (e.g. "python") becomes a one-item list.
    return [value]


def extract(text: str, *, generator=None, template=None):
    """Extract skill and knowledge items from *text* with the language model.

    The prompt template is filled in, sent to the generator, and the first
    dict found in the reply is reduced to its two expected keys.

    Args:
        text: Free-form text to analyse.
        generator: Callable that takes the prompt string and returns a list
            of dicts carrying ``"generated_text"`` (or ``"text"``). Defaults
            to the module-level ``gen``.
        template: Prompt containing a literal ``{text}`` placeholder.
            Defaults to the module-level ``prompt``.

    Returns:
        ``{"SKILL": [...], "KNOWLEDGE": [...]}``. A key maps to an empty list
        when the reply holds no parsable value for it.
    """
    if generator is None:
        generator = gen
    if template is None:
        template = prompt
    # The template carries a literal JSON example in braces; str.format would
    # parse that as a replacement field and raise KeyError, so substitute the
    # single placeholder directly.
    out = generator(template.replace("{text}", text))
    raw = ""
    if out:
        first = out[0]
        if isinstance(first, dict):
            raw = first.get("generated_text") or first.get("text") or str(first)
        else:
            raw = str(first)
    if not isinstance(raw, str):
        raw = str(raw)
    data = _parse_reply(raw)
    return {
        "SKILL": _as_list(data.get("SKILL")),
        "KNOWLEDGE": _as_list(data.get("KNOWLEDGE")),
    }

# --- Page layout: a title, an input box, and a button that runs extraction ---
st.title("Skill/Knowledge Extractor")
text = st.text_area("Paste text")
submitted = st.button("Extract")
if submitted:
    # Whitespace-only input is ignored; nothing is rendered for it.
    if text.strip():
        st.json(extract(text))