allbibek committed on
Commit
80ec8c9
·
verified ·
1 Parent(s): e3e1895

add fine-tuned

Browse files
Files changed (1) hide show
  1. app.py +57 -10
app.py CHANGED
@@ -1,18 +1,26 @@
1
  import gradio as gr
2
- from sentence_transformers import SentenceTransformer
3
  from supabase import create_client
4
  import os
5
  from dotenv import load_dotenv
6
 
7
  load_dotenv()
8
 
9
- model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
 
10
 
11
  # === Supabase ===
12
  supabase_url = os.getenv("SUPABASE_URL")
13
  supabase_key = os.getenv("SUPABASE_KEY")
14
  supabase = create_client(supabase_url, supabase_key)
15
 
 
 
 
 
 
 
 
16
  def get_embedding(text: str):
17
  """
18
  Menghasilkan embedding vector dari teks menggunakan model SentenceTransformer.
@@ -20,9 +28,32 @@ def get_embedding(text: str):
20
  if not text:
21
  return []
22
 
23
- embedding = model.encode(text).tolist()
 
24
  return embedding
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  def search_kbli(text: str):
27
  if not text:
28
  return {"embedding": [], "results": []}
@@ -156,19 +187,35 @@ with gr.Blocks(css="""
156
  inp1.submit(get_embedding, inp1, out1, api_name="get_embedding")
157
  btn_clear1.click(lambda: ("", None), None, [inp1, out1])
158
  btn_submit1.click(get_embedding, inp1, out1, api_name="get_embedding")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
  with gr.Tab("Search KBLI"):
161
- inp2 = gr.Textbox(label="Masukkan teks")
162
 
163
  with gr.Row(elem_classes="btn-row-search"):
164
- btn_clear2 = gr.Button("Clear", variant="secondary")
165
- btn_submit2 = gr.Button("Submit", variant="primary")
166
 
167
- out2 = gr.HTML(label="Hasil Pencarian Semantic")
168
 
169
- inp2.submit(search_kbli, inp2, out2, api_name="search_kbli")
170
- btn_clear2.click(lambda: ("", None), None, [inp2, out2])
171
- btn_submit2.click(search_kbli, inp2, out2, api_name="search_kbli")
172
 
173
  if __name__ == "__main__":
174
  demo.queue().launch()
 
1
  import gradio as gr
2
+ from sentence_transformers import SentenceTransformer, CrossEncoder
3
  from supabase import create_client
4
  import os
5
  from dotenv import load_dotenv
6
 
7
  load_dotenv()
8
 
9
+ embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
10
+ reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
11
 
12
  # === Supabase ===
13
  supabase_url = os.getenv("SUPABASE_URL")
14
  supabase_key = os.getenv("SUPABASE_KEY")
15
  supabase = create_client(supabase_url, supabase_key)
16
 
17
+ def expand_query(query: str) -> str:
18
+ """
19
+ Fungsi kecil untuk memperluas query.
20
+ Bisa ditambah sinonim, kata kunci, atau parafrasa.
21
+ """
22
+ return f"Kegiatan usaha yang berkaitan dengan {query}"
23
+
24
  def get_embedding(text: str):
25
  """
26
  Menghasilkan embedding vector dari teks menggunakan model SentenceTransformer.
 
28
  if not text:
29
  return []
30
 
31
+ expanded_text = expand_query(text)
32
+ embedding = embedder.encode(expanded_text).tolist()
33
  return embedding
34
 
35
+ def fn_semantic(query: str, match_count: int = 25):
36
+ embedding = embedder.encode(query).tolist()
37
+
38
+ response = supabase.rpc(
39
+ "search_kbli",
40
+ {"query_embedding": embedding, "match_count": match_count}
41
+ ).execute()
42
+ candidates = response.data or []
43
+
44
+ if not candidates:
45
+ return {"results": []}
46
+
47
+ pairs = [(query, c["judul"] + " " + c["deskripsi"]) for c in candidates]
48
+ scores = reranker.predict(pairs)
49
+
50
+ for c, s in zip(candidates, scores):
51
+ c["rerank_score"] = float(s)
52
+
53
+ candidates = sorted(candidates, key=lambda x: x["rerank_score"], reverse=True)
54
+
55
+ return {"results": candidates[:10]}
56
+
57
  def search_kbli(text: str):
58
  if not text:
59
  return {"embedding": [], "results": []}
 
187
  inp1.submit(get_embedding, inp1, out1, api_name="get_embedding")
188
  btn_clear1.click(lambda: ("", None), None, [inp1, out1])
189
  btn_submit1.click(get_embedding, inp1, out1, api_name="get_embedding")
190
+
191
+ with gr.Tab("Embedding Fine-tuned"):
192
+ with gr.Row():
193
+ with gr.Column(scale=1):
194
+ inp2 = gr.Textbox(label="Masukkan teks")
195
+
196
+ with gr.Row(elem_classes="btn-row"):
197
+ btn_clear2 = gr.Button("Clear", variant="secondary")
198
+ btn_submit2 = gr.Button("Submit", variant="primary")
199
+
200
+ with gr.Column(scale=1):
201
+ out2 = gr.JSON(label="Embedding Vector")
202
+
203
+ inp2.submit(fn_semantic, inp2, out2, api_name="fn_semantic")
204
+ btn_clear2.click(lambda: ("", None), None, [inp2, out2])
205
+ btn_submit2.click(fn_semantic, inp2, out2, api_name="fn_semantic")
206
 
207
  with gr.Tab("Search KBLI"):
208
+ inp3 = gr.Textbox(label="Masukkan teks")
209
 
210
  with gr.Row(elem_classes="btn-row-search"):
211
+ btn_clear3 = gr.Button("Clear", variant="secondary")
212
+ btn_submit3 = gr.Button("Submit", variant="primary")
213
 
214
+ out3 = gr.HTML(label="Hasil Pencarian Semantic")
215
 
216
+ inp3.submit(search_kbli, inp3, out3, api_name="search_kbli")
217
+ btn_clear3.click(lambda: ("", None), None, [inp3, out3])
218
+ btn_submit3.click(search_kbli, inp3, out3, api_name="search_kbli")
219
 
220
  if __name__ == "__main__":
221
  demo.queue().launch()