Spaces:
Paused
Paused
Tao Wu committed on
Commit ·
22f807c
1
Parent(s): 381ef72
add explanation
Browse files- app/app.py +27 -3
- app/embedding_setup.py +67 -3
app/app.py
CHANGED
|
@@ -5,8 +5,8 @@ import json
|
|
| 5 |
import requests
|
| 6 |
from config import *
|
| 7 |
import functools
|
| 8 |
-
from embedding_setup import retriever, find_similar_occupation,
|
| 9 |
-
from data_process import
|
| 10 |
with open('/app/data/redis_data.json', 'r') as file:
|
| 11 |
data_dict = json.load(file)
|
| 12 |
#r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, decode_responses=True)
|
|
@@ -40,12 +40,36 @@ def retrieve_documents(occupation,skills):
|
|
| 40 |
sorted_docs = sorted(docs, key=functools.cmp_to_key(partial_compare_docs), reverse=True)
|
| 41 |
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
output.append(f"<b>Qualifikationslücke:</b> {skill_query}")
|
| 44 |
output.append(f"<b>Empfohlene Kurse:</b>")
|
| 45 |
-
for doc in sorted_docs:
|
| 46 |
doc_name = doc.metadata.get('name', 'Unnamed Document')
|
| 47 |
doc_url = doc.metadata.get('url', '#')
|
| 48 |
output.append(f"<a href='{doc_url}' target='_blank'>{doc_name}</a>")
|
|
|
|
|
|
|
|
|
|
| 49 |
output.append(f"<br>")
|
| 50 |
return "<br>".join(output)
|
| 51 |
|
|
|
|
| 5 |
import requests
|
| 6 |
from config import *
|
| 7 |
import functools
|
| 8 |
+
from embedding_setup import retriever, find_similar_occupation, compare_docs_with_context,generate_exp,generate_prompt_exp
|
| 9 |
+
from data_process import get_occupations_from_csv, get_courses_from_BA, get_occupation_detial, build_occupation_query
|
| 10 |
with open('/app/data/redis_data.json', 'r') as file:
|
| 11 |
data_dict = json.load(file)
|
| 12 |
#r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, decode_responses=True)
|
|
|
|
| 40 |
sorted_docs = sorted(docs, key=functools.cmp_to_key(partial_compare_docs), reverse=True)
|
| 41 |
|
| 42 |
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
batch_prompts = []
|
| 47 |
+
for doc in sorted_docs[:5]:
|
| 48 |
+
doc_name = doc.metadata.get('name', 'Unnamed Document')
|
| 49 |
+
doc_skill = doc.metadata.get('skills', '')
|
| 50 |
+
output.append(f"<a href='{doc_url}' target='_blank'>{doc_name}</a>")
|
| 51 |
+
input_text = f"target occupation: {target_occupation_query}\n courses: name: {doc_name}, learning objectives: {doc_skill}"
|
| 52 |
+
prompt = generate_prompt_exp(input_text)
|
| 53 |
+
batch_prompts.append(prompt)
|
| 54 |
+
|
| 55 |
+
# Evaluate the current batch of prompts
|
| 56 |
+
batch_output = generate_exp(batch_prompts)
|
| 57 |
+
for i in range(5):
|
| 58 |
+
doc = sorted_docs[i]
|
| 59 |
+
doc_name = doc.metadata.get('name', 'Unnamed Document')
|
| 60 |
+
doc_url = doc.metadata.get('url', '#')
|
| 61 |
+
doc_skill = doc.metadata.get('skills', '')
|
| 62 |
+
output.append(f"<a href='{doc_url}' target='_blank'>{doc_name}</a>")
|
| 63 |
+
output.append(f"<b>Recommendation Explanation:</b> {batch_output[i]}")
|
| 64 |
output.append(f"<b>Qualifikationslücke:</b> {skill_query}")
|
| 65 |
output.append(f"<b>Empfohlene Kurse:</b>")
|
| 66 |
+
for doc in sorted_docs[:5]:
|
| 67 |
doc_name = doc.metadata.get('name', 'Unnamed Document')
|
| 68 |
doc_url = doc.metadata.get('url', '#')
|
| 69 |
output.append(f"<a href='{doc_url}' target='_blank'>{doc_name}</a>")
|
| 70 |
+
input_text = f"target occupation: {target_occupation_query}\n courses: name: {doc_name['course_name']}, learning objectives: {doc_name['skills']}"
|
| 71 |
+
prompt = generate_prompt_exp(input_text)
|
| 72 |
+
batch_prompts.append(prompt)
|
| 73 |
output.append(f"<br>")
|
| 74 |
return "<br>".join(output)
|
| 75 |
|
app/embedding_setup.py
CHANGED
|
@@ -31,8 +31,8 @@ retriever = db.as_retriever(search_kwargs={"k": TOP_K})
|
|
| 31 |
|
| 32 |
|
| 33 |
LLM_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
|
| 34 |
-
|
| 35 |
-
|
| 36 |
hf_auth = os.environ.get("hf_token")
|
| 37 |
|
| 38 |
|
|
@@ -53,11 +53,12 @@ model = AutoModelForCausalLM.from_pretrained(
|
|
| 53 |
|
| 54 |
rec_adapter = PeftModel.from_pretrained(
|
| 55 |
model,
|
| 56 |
-
|
| 57 |
torch_dtype=torch.float16,
|
| 58 |
device_map={'': 0}
|
| 59 |
)
|
| 60 |
|
|
|
|
| 61 |
tokenizer.padding_side = "left"
|
| 62 |
# unwind broken decapoda-research config
|
| 63 |
#model.half() # seems to fix bugs for some users.
|
|
@@ -67,6 +68,8 @@ rec_adapter.config.pad_token_id = tokenizer.pad_token_id = 0 # unk
|
|
| 67 |
rec_adapter.config.bos_token_id = 1
|
| 68 |
rec_adapter.config.eos_token_id = 2
|
| 69 |
|
|
|
|
|
|
|
| 70 |
def generate_prompt(target_occupation, skill_gap, courses):
|
| 71 |
return f"""
|
| 72 |
### Instruction:
|
|
@@ -147,6 +150,67 @@ def compare_docs_with_context(doc_a, doc_b, df_course, target_occupation_name, t
|
|
| 147 |
else:
|
| 148 |
return 0 # Consider them equal if the response is unclear
|
| 149 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
def find_similar_occupation(target_occupation_query, berufe, top_k, similarity_func):
|
| 151 |
|
| 152 |
# Pro kurs wird ein Document erstellt. Dieses enthält Metadaten sowie einen page_content.
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
LLM_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
|
| 34 |
+
lora_weights_rec = "wt3639/Llama-3-8B-Instruct_CourseRec_lora"
|
| 35 |
+
lora_weights_exp = "wt3639/Llama-3-8B-Instruct_RecExp_lora"
|
| 36 |
hf_auth = os.environ.get("hf_token")
|
| 37 |
|
| 38 |
|
|
|
|
| 53 |
|
| 54 |
rec_adapter = PeftModel.from_pretrained(
|
| 55 |
model,
|
| 56 |
+
lora_weights_rec,
|
| 57 |
torch_dtype=torch.float16,
|
| 58 |
device_map={'': 0}
|
| 59 |
)
|
| 60 |
|
| 61 |
+
|
| 62 |
tokenizer.padding_side = "left"
|
| 63 |
# unwind broken decapoda-research config
|
| 64 |
#model.half() # seems to fix bugs for some users.
|
|
|
|
| 68 |
rec_adapter.config.bos_token_id = 1
|
| 69 |
rec_adapter.config.eos_token_id = 2
|
| 70 |
|
| 71 |
+
|
| 72 |
+
|
| 73 |
def generate_prompt(target_occupation, skill_gap, courses):
|
| 74 |
return f"""
|
| 75 |
### Instruction:
|
|
|
|
| 150 |
else:
|
| 151 |
return 0 # Consider them equal if the response is unclear
|
| 152 |
|
| 153 |
+
|
| 154 |
+
#-----------------------------------------explanation-------------------------------------
|
| 155 |
+
# Wrap the shared base `model` (the same object that is passed to
# `rec_adapter`) with the LoRA weights named by `lora_weights_exp`.
exp_adapter = PeftModel.from_pretrained(
    model,
    lora_weights_exp,
    torch_dtype=torch.float16,
    device_map={'': 0}
)
# Used for inference only, so switch to evaluation mode.
exp_adapter.eval()

# The chained assignment also overwrites `tokenizer.pad_token_id`, not just
# the adapter config.
# NOTE(review): these ids (0 / 1 / 2) mirror the values set on `rec_adapter`
# and the "# unk" remark; they look carried over from an older LLaMA setup.
# Confirm they match the special tokens of the tokenizer loaded for LLM_MODEL.
exp_adapter.config.pad_token_id = tokenizer.pad_token_id = 0 # unk
exp_adapter.config.bos_token_id = 1
exp_adapter.config.eos_token_id = 2
|
| 166 |
+
|
| 167 |
+
def generate_prompt_exp(input_text: str) -> str:
    """Build the instruction prompt for explaining one course recommendation.

    Args:
        input_text: Text inserted verbatim under the ``### Input:`` heading.

    Returns:
        The prompt string. It starts with a newline and ends with
        ``### Response:`` followed by a newline; ``generate_exp`` splits the
        decoded text on ``'Response:\\n'``, so that trailing marker must stay.
    """
    # The template lines are deliberately flush-left: any indentation here
    # would become part of the prompt text.
    return f"""
### Instruction:
As an education expert, you have been provided with target occupations and recommended course information. Your task is to explain the recommendation in German.

### Input:
{input_text}

### Response:
"""
|
| 177 |
+
|
| 178 |
+
def generate_exp(
    prompt=None,
    temperature=0,
    top_p=1.0,
    top_k=40,
    num_beams=1,
    max_new_tokens=140,
    batch_size=1,
    **kwargs,
):
    """Generate explanation texts for one prompt or a batch of prompts.

    Args:
        prompt: A prompt string or a list/tuple of prompt strings (as built
            by ``generate_prompt_exp``). ``None`` or an empty list/tuple
            yields an empty result without touching the model.
        temperature: Forwarded to ``GenerationConfig``.
        top_p: Forwarded to ``GenerationConfig``.
        top_k: Forwarded to ``GenerationConfig``.
        num_beams: Forwarded to ``GenerationConfig``.
        max_new_tokens: Upper bound on generated tokens per prompt.
        batch_size: Currently unused; kept so existing callers keep working.
        **kwargs: Extra options forwarded to ``GenerationConfig``.

    Returns:
        A list with one string per prompt: the decoded sequence with
        everything up to the last ``'Response:\\n'`` marker removed.
    """
    # Nothing to generate: avoid handing None / an empty batch to the
    # tokenizer, which cannot build tensors from it.
    if prompt is None or (isinstance(prompt, (list, tuple)) and not prompt):
        return []

    inputs = tokenizer(
        prompt, return_tensors="pt", padding=True, truncation=True
    ).to(device)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        # NOTE(review): generation goes through `model`, not `exp_adapter`,
        # although the explanation adapter is loaded at module level.
        # Confirm this is intended before changing it.
        generation_output = model.generate(
            **inputs,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
            # batch_size=batch_size,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )
    decoded = tokenizer.batch_decode(
        generation_output.sequences, skip_special_tokens=True
    )
    # The decoded text still contains the prompt; keep only what follows the
    # final response marker.
    return [text.split('Response:\n')[-1] for text in decoded]
|
| 212 |
+
|
| 213 |
+
|
| 214 |
def find_similar_occupation(target_occupation_query, berufe, top_k, similarity_func):
|
| 215 |
|
| 216 |
# Pro kurs wird ein Document erstellt. Dieses enthält Metadaten sowie einen page_content.
|