infinityy committed on
Commit
7099dd8
·
verified ·
1 Parent(s): e88c53c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -3
app.py CHANGED
@@ -5,6 +5,18 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer
5
  import torch
6
  import numpy as np
7
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  # Load BERT model & tokenizer
10
  tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
@@ -29,7 +41,67 @@ category_mappings = {
29
  "kmeans_labels": pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int32')
30
  }
31
 
32
- def infer(component,title,description):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  # BERT embedding
34
  combined_text = f"{component} [SEP] {title} [SEP] {description}"
35
  inputs = tokenizer(combined_text, return_tensors="pt", truncation=True,max_length=512)
@@ -57,8 +129,10 @@ iface = gr.Interface(
57
  inputs=[
58
  gr.Textbox(label="Component"),
59
  gr.Textbox(label="Title"),
60
- gr.Textbox(label="Description")
 
 
61
  ],
62
  outputs="text"
63
  )
64
- iface.launch()
 
5
  import torch
6
  import numpy as np
7
  import pandas as pd
8
+ from supabase import create_client, Client
9
+ from google import genai
10
+ import os
11
+
12
+
13
# --- External service clients, configured from environment variables ---

# Gemini API key is read from the 'gemini_secret' env var.
# NOTE(review): os.getenv returns None when unset — the genai.Client is still
# constructed here and requests would presumably fail later; consider failing fast.
apikey = os.getenv('gemini_secret')
client = genai.Client(api_key=apikey)

# Supabase connection settings (shared with the Next.js frontend, hence the
# NEXT_PUBLIC_* names). Both must be set before the app starts.
supabase_url = os.getenv("NEXT_PUBLIC_SUPABASE_URL")
supabase_anon_key = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY")

# Module-level Supabase client used by retrieve_similar_documents().
supabase: Client = create_client(supabase_url, supabase_anon_key)
20
 
21
  # Load BERT model & tokenizer
22
  tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
 
41
  "kmeans_labels": pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int32')
42
  }
43
 
44
+
45
def combine_document_features(component, description):
    """Return the single text string used as the embedding input for a bug.

    The exact phrasing matters: it must match what the stored embeddings
    were computed from, so the template is kept verbatim.
    """
    combined = f"Component is {component} with Description {description}"
    return combined
47
+
48
+
49
def retrieve_similar_documents(component, description, pid):
    """Embed the new bug text and fetch similar bugs from Supabase.

    Parameters:
        component: bug component name used in the embedding text.
        description: bug description used in the embedding text.
        pid: project identifier forwarded to the "get_similar_bugs" RPC.

    Returns:
        (documents, new_doc_text) where documents is a list of row dicts from
        the RPC, or None when the RPC returned no rows; new_doc_text is the
        combined text that was embedded (reused by the caller for prompting).
    """
    new_doc_text = combine_document_features(component, description)
    inputs = tokenizer(new_doc_text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only — no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)
    # [CLS] token vector of the last hidden layer, as a plain list for JSON transport.
    new_embedding = outputs.hidden_states[-1][:, 0, :][0].tolist()

    # Similarity search is done server-side via a Supabase RPC.
    response = supabase.rpc("get_similar_bugs", {
        "query_embedding": new_embedding,
        "pid": pid
    }).execute()

    if not response.data:
        return None, new_doc_text
    # response.data is already an iterable of row dicts; copying item-by-item
    # into a new list with a for/append loop was redundant.
    return list(response.data), new_doc_text
68
+
69
def generate_title(soft_prompt, new_document):
    """Ask Gemini for a bug-report title, few-shot prompted from soft_prompt.

    Parameters:
        soft_prompt: mapping of title -> [component, description] example pairs.
        new_document: combined text of the bug to title.

    Returns:
        The raw text of the model response.
    """
    # Render each example as a Component/Description/title stanza.
    examples_text = '\n\n'.join(
        f'Component: {value[0]}\n Description: {value[1]}\n title: {key}'
        for key, value in soft_prompt.items()
    )

    input_prompt = f'''Generate a Bug report title based on description and Component. Do not generate anything else. Try to keep it in a limit of 30 characters.
Look at the below examples to understand the task.
{examples_text}
Now using the examples above, try giving the output for the following input enclosed within the | delimiter.
| {new_document} |.
Give the title as instructed above. Give the title only'''

    # Single-shot call to the module-level Gemini client.
    return client.models.generate_content(model="gemini-2.0-flash", contents=input_prompt).text
83
+
84
def suggest_title(component, description, pid):
    """Suggest a bug-report title for (component, description) within project pid.

    Retrieves similar past bugs from Supabase to use as few-shot examples,
    pads with two canned examples when fewer than 3 neighbours are found,
    then delegates to generate_title().

    Returns:
        The generated title string.
    """
    similar_docs, new_doc_text = retrieve_similar_documents(component, description, pid)

    # Canned fallback few-shot examples: {title: [component, description]}.
    examples={'''Concatenation of string variables slow compared to strings themselves''':['''JavaScript Engine''','''Ill be uploading a test case with various tests of string concatenation. ; Mozilla (build 2000040308) shows good performance with all the ones that uses ; strings directly; e.g. string1 + string2. its the last three it has ; problems with; they use string variables (e.g. var1 + var 2) in the ; concatenation.; ; try it out for yourselves. all numberical values shown in the form fields is ; the execution time in millseconds. the four tests on the left hand side; and ; the 2 at the top on the right hand side finished in around 1650ms on my P3/450. ; this is just the same speed as Netscape Comm 4.72. On the last three tests on ; the right hand side NC4.72 uses 7000ms; 10000ms and around 4500ms respectively; ; while Mozilla suddenly uses 10000ms; 14750ms and 5500ms on the same three tests. ; Im slightly surprised by this sudden large increase in execution time.; ; the test results are very positive compared to IE5.01 though; except for the ; three tests with variables in them. the 4 tests on the left hand side; from top ; to bottom; finish in around 5.5s; 9s; 12.5s and 16s in IE5.01. in other words; ; a nearly linear increase in usage for each string that gets added. the two top ; tests on the right hand side finish in around 9.3s and 20s; a _huge_ difference ; from both Mozilla and Communicator. 
the last three tests; with variables; ; execute at just about the same speed as Communicator though (the last one ; actually about a second faster).'''],'''Linux/Slackware: undefined iostream symbols; app wont start''':['''HTML: Parser''','''johnny:~/mozilla/package# ./mozilla-apprunner.sh; MOZILLA_FIVE_HOME=/root/mozilla/package; LD_LIBRARY_PATH=/root/mozilla/package:/usr/local/rvplayer5.0; MOZ_PROGRAM=./apprunner; moz_debug=0; moz_debugger=; ./apprunner: error in loading shared libraries; /root/mozilla/package/libraptorhtmlpars.so: undefined symbol:; __vt_8iostream.3ios; ; I am running Slackware 4.0 and never have had any luck running any; of these milestone releases. This was the M7 attempt.; Just thought you should know.; Thanks; Johnny O''']}

    # Build the few-shot set from retrieved neighbours with positive similarity.
    # Start from {} directly instead of the previous "else None / if None -> {}"
    # two-step, which was redundant.
    if similar_docs:
        soft_prompt = {i["title"]: [i["component"], i["description"]] for i in similar_docs if i["similarity"] > 0}
    else:
        soft_prompt = {}

    # Pad with the canned examples up to 3 shots, never overwriting a retrieved one.
    for k, v in examples.items():
        if len(soft_prompt) >= 3:
            break
        if k not in soft_prompt:
            soft_prompt[k] = v

    # generate_title already returns a str; the previous f"{...}" wrapper was a no-op.
    return generate_title(soft_prompt, new_doc_text)
+
102
+ def infer(component,title,description,pid=None,mode="priority"):
103
+ if mode=="title":
104
+ return suggest_title(component, description,pid)
105
  # BERT embedding
106
  combined_text = f"{component} [SEP] {title} [SEP] {description}"
107
  inputs = tokenizer(combined_text, return_tensors="pt", truncation=True,max_length=512)
 
129
  inputs=[
130
  gr.Textbox(label="Component"),
131
  gr.Textbox(label="Title"),
132
+ gr.Textbox(label="Description"),
133
+ gr.Textbox(label="PID (optional)", value="default_pid"), # Or Hidden
134
+ gr.Radio(["priority", "title"], label="Mode", value="priority")
135
  ],
136
  outputs="text"
137
  )
138
+ iface.launch()