import os

import gradio as gr
import joblib
import pandas as pd
import torch
import xgboost as xgb
from google import genai
from safetensors.torch import load_file
from supabase import create_client, Client
from transformers import AutoModelForSequenceClassification, AutoTokenizer
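# NOTE: all three secrets below are read from the environment and must be set
# (e.g. as Space secrets) for the app to start:
#   gemini_secret, NEXT_PUBLIC_SUPABASE_URL, NEXT_PUBLIC_SUPABASE_ANON_KEY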
api_key = os.getenv("gemini_secret")
client = genai.Client(api_key=api_key)

supabase_url = os.getenv("NEXT_PUBLIC_SUPABASE_URL")
supabase_anon_key = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY")
supabase: Client = create_client(supabase_url, supabase_anon_key)
# Load the DistilBERT tokenizer and a 5-label classification head; hidden states
# are requested so the CLS embedding can be extracted downstream.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-cased", num_labels=5, output_hidden_states=True
)

# Load the fine-tuned weights; strict=False silently skips any missing or
# unexpected keys in the checkpoint.
state_dict = load_file("model (4).safetensors")
model.load_state_dict(state_dict, strict=False)
model.eval()
# Load the fitted preprocessing artifacts and the clustering/classification models
scaler = joblib.load("scaler.pkl")
pca = joblib.load("pca.pkl")
kmeans = joblib.load("kmeans_model.pkl")

xgb_model = xgb.XGBClassifier()
xgb_model.load_model("xgb_model.json")

category_mappings = {
    "kmeans_labels": pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype="int32")
}
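# NOTE: passing "kmeans_labels" to XGBoost as a pandas Categorical assumes the
# model was trained with enable_categorical=True on these same 10 cluster ids;
# otherwise predict() would reject the categorical column.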
def combine_document_features(component, description):
    return f"Component is {component} with Description {description}"
def retrieve_similar_documents(component, description, pid):
    """Embed the new bug text and fetch similar bugs for this project from Supabase."""
    new_doc_text = combine_document_features(component, description)
    inputs = tokenizer(new_doc_text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # CLS-token embedding from the last hidden layer, as a plain list for the RPC call
    new_embedding = outputs.hidden_states[-1][:, 0, :][0].tolist()

    # Ask Postgres for the bugs most similar to the new embedding
    response = supabase.rpc("get_similar_bugs", {
        "query_embedding": new_embedding,
        "pid": pid,
    }).execute()
    if not response.data:
        return None, new_doc_text, new_embedding
    return list(response.data), new_doc_text, new_embedding
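# NOTE: "get_similar_bugs" is a Postgres function on the Supabase side whose exact
# definition is not part of this file. A minimal pgvector sketch consistent with
# the fields consumed below (title/component/description/similarity) might look
# like this; the table name, column names, and result limit are assumptions:
#
#   create or replace function get_similar_bugs(query_embedding vector(768), pid text)
#   returns table (title text, component text, description text, similarity float)
#   language sql as $$
#     select title, component, description,
#            1 - (embedding <=> query_embedding) as similarity
#     from bugs
#     where project_id = pid
#     order by embedding <=> query_embedding
#     limit 3;
#   $$;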
def generate_title(soft_prompt, new_document):
    """Few-shot prompt Gemini with title/component/description examples."""
    soft_prompt_examples = "\n\n".join(
        f"Component: {value[0]}\nDescription: {value[1]}\nTitle: {key}"
        for key, value in soft_prompt.items()
    )
    input_prompt = f'''Generate a bug report title based on the description and component. Do not generate anything else. Try to keep it within a limit of 30 characters.
Look at the examples below to understand the task.
{soft_prompt_examples}
Now, using the examples above, give the output for the following input enclosed within the | delimiter.
| {new_document} |
Give the title as instructed above. Give the title only.'''
    response = client.models.generate_content(model="gemini-2.0-flash", contents=input_prompt)
    return response.text
def suggest_title(component, description, pid):
    similar_docs, new_doc_text, embedding = retrieve_similar_documents(component, description, pid)
    # Fallback few-shot examples (title -> [component, description]) used when no
    # similar bugs exist for this project; the descriptions are kept verbatim.
    examples = {
        '''Concatenation of string variables slow compared to strings themselves''': [
            '''JavaScript Engine''',
            '''Ill be uploading a test case with various tests of string concatenation. ; Mozilla (build 2000040308) shows good performance with all the ones that uses ; strings directly; e.g. string1 + string2. its the last three it has ; problems with; they use string variables (e.g. var1 + var 2) in the ; concatenation.; ; try it out for yourselves. all numberical values shown in the form fields is ; the execution time in millseconds. the four tests on the left hand side; and ; the 2 at the top on the right hand side finished in around 1650ms on my P3/450. ; this is just the same speed as Netscape Comm 4.72. On the last three tests on ; the right hand side NC4.72 uses 7000ms; 10000ms and around 4500ms respectively; ; while Mozilla suddenly uses 10000ms; 14750ms and 5500ms on the same three tests. ; Im slightly surprised by this sudden large increase in execution time.; ; the test results are very positive compared to IE5.01 though; except for the ; three tests with variables in them. the 4 tests on the left hand side; from top ; to bottom; finish in around 5.5s; 9s; 12.5s and 16s in IE5.01. in other words; ; a nearly linear increase in usage for each string that gets added. the two top ; tests on the right hand side finish in around 9.3s and 20s; a _huge_ difference ; from both Mozilla and Communicator. the last three tests; with variables; ; execute at just about the same speed as Communicator though (the last one ; actually about a second faster).''',
        ],
        '''Linux/Slackware: undefined iostream symbols; app wont start''': [
            '''HTML: Parser''',
            '''johnny:~/mozilla/package# ./mozilla-apprunner.sh; MOZILLA_FIVE_HOME=/root/mozilla/package; LD_LIBRARY_PATH=/root/mozilla/package:/usr/local/rvplayer5.0; MOZ_PROGRAM=./apprunner; moz_debug=0; moz_debugger=; ./apprunner: error in loading shared libraries; /root/mozilla/package/libraptorhtmlpars.so: undefined symbol:; __vt_8iostream.3ios; ; I am running Slackware 4.0 and never have had any luck running any; of these milestone releases. This was the M7 attempt.; Just thought you should know.; Thanks; Johnny O''',
        ],
    }
    soft_prompt = (
        {i["title"]: [i["component"], i["description"]] for i in similar_docs if i["similarity"] > 0}
        if similar_docs else None
    )
    if soft_prompt is None:
        # No retrieved bugs: fall back to up to 3 of the canned examples
        soft_prompt = {}
        for k, v in examples.items():
            if len(soft_prompt) >= 3:
                break
            if k not in soft_prompt:
                soft_prompt[k] = v
    generated_title = generate_title(soft_prompt, new_doc_text)
    return {
        "title": generated_title,
        "embedding": embedding,
    }
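# Returning the embedding alongside the title lets the caller persist it with the
# new bug report, presumably so later get_similar_bugs queries can match against it.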
def infer(component, title="", description="", pid=None, mode="priority"):
    if mode == "title":
        return suggest_title(component, description, pid)

    # Priority mode: embed the combined text with DistilBERT
    combined_text = f"{component} [SEP] {title} [SEP] {description}"
    inputs = tokenizer(combined_text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.hidden_states[-1][:, 0, :].numpy()
    test_df = pd.DataFrame(cls_embedding)

    # Preprocessing: standardize, project onto 75 PCA components, then append the
    # k-means cluster id as a categorical feature
    test_pca = scaler.transform(test_df)
    test_pca = pca.transform(test_pca)
    test_df = pd.DataFrame(test_pca, columns=[f"PCA{i+1}" for i in range(75)], index=test_df.index)
    kmeans_labels_test = kmeans.predict(test_df)
    test_df["kmeans_labels"] = pd.Categorical(kmeans_labels_test, categories=category_mappings["kmeans_labels"])

    # Predict the priority class using the first 130 boosting rounds
    prediction = xgb_model.predict(test_df, iteration_range=(0, 130))
    return f"{int(prediction[0])}"
# Gradio interface
iface = gr.Interface(
    fn=infer,
    inputs=[
        gr.Textbox(label="Component"),
        gr.Textbox(label="Title"),
        gr.Textbox(label="Description"),
        gr.Textbox(label="PID (optional)", value="default_pid"),  # could also be a hidden field
        gr.Radio(["priority", "title"], label="Mode", value="priority"),
    ],
    outputs="text",
)
iface.launch()