Spaces:
Running
Running
add fine-tuned
Browse files
app.py
CHANGED
|
@@ -1,18 +1,26 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
from sentence_transformers import SentenceTransformer
|
| 3 |
from supabase import create_client
|
| 4 |
import os
|
| 5 |
from dotenv import load_dotenv
|
| 6 |
|
| 7 |
load_dotenv()
|
| 8 |
|
| 9 |
-
|
|
|
|
| 10 |
|
| 11 |
# === Supabase ===
|
| 12 |
supabase_url = os.getenv("SUPABASE_URL")
|
| 13 |
supabase_key = os.getenv("SUPABASE_KEY")
|
| 14 |
supabase = create_client(supabase_url, supabase_key)
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
def get_embedding(text: str):
|
| 17 |
"""
|
| 18 |
Menghasilkan embedding vector dari teks menggunakan model SentenceTransformer.
|
|
@@ -20,9 +28,32 @@ def get_embedding(text: str):
|
|
| 20 |
if not text:
|
| 21 |
return []
|
| 22 |
|
| 23 |
-
|
|
|
|
| 24 |
return embedding
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
def search_kbli(text: str):
|
| 27 |
if not text:
|
| 28 |
return {"embedding": [], "results": []}
|
|
@@ -156,19 +187,35 @@ with gr.Blocks(css="""
|
|
| 156 |
inp1.submit(get_embedding, inp1, out1, api_name="get_embedding")
|
| 157 |
btn_clear1.click(lambda: ("", None), None, [inp1, out1])
|
| 158 |
btn_submit1.click(get_embedding, inp1, out1, api_name="get_embedding")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
|
| 160 |
with gr.Tab("Search KBLI"):
|
| 161 |
-
|
| 162 |
|
| 163 |
with gr.Row(elem_classes="btn-row-search"):
|
| 164 |
-
|
| 165 |
-
|
| 166 |
|
| 167 |
-
|
| 168 |
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
|
| 173 |
if __name__ == "__main__":
|
| 174 |
demo.queue().launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
from sentence_transformers import SentenceTransformer, CrossEncoder
|
| 3 |
from supabase import create_client
|
| 4 |
import os
|
| 5 |
from dotenv import load_dotenv
|
| 6 |
|
| 7 |
load_dotenv()
|
| 8 |
|
| 9 |
+
embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
|
| 10 |
+
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
|
| 11 |
|
| 12 |
# === Supabase ===
|
| 13 |
supabase_url = os.getenv("SUPABASE_URL")
|
| 14 |
supabase_key = os.getenv("SUPABASE_KEY")
|
| 15 |
supabase = create_client(supabase_url, supabase_key)
|
| 16 |
|
| 17 |
+
def expand_query(query: str) -> str:
    """Expand a raw query by wrapping it in a fixed Indonesian phrase.

    This is a small query-expansion hook; synonyms, extra keywords, or
    paraphrases can be added here later.

    Args:
        query: The user's search text.

    Returns:
        The query prefixed with "Kegiatan usaha yang berkaitan dengan ".
    """
    expanded = f"Kegiatan usaha yang berkaitan dengan {query}"
    return expanded
|
| 23 |
+
|
| 24 |
def get_embedding(text: str):
|
| 25 |
"""
|
| 26 |
Menghasilkan embedding vector dari teks menggunakan model SentenceTransformer.
|
|
|
|
| 28 |
if not text:
|
| 29 |
return []
|
| 30 |
|
| 31 |
+
expanded_text = expand_query(text)
|
| 32 |
+
embedding = embedder.encode(expanded_text).tolist()
|
| 33 |
return embedding
|
| 34 |
|
| 35 |
+
def fn_semantic(query: str, match_count: int = 25, top_k: int = 10):
    """Retrieve KBLI candidates by vector similarity, then rerank them.

    The query is embedded with the module-level ``embedder``, passed to the
    Supabase ``search_kbli`` RPC to fetch ``match_count`` candidates, and the
    candidates are re-scored against the query with the module-level
    cross-encoder ``reranker``.

    Args:
        query: Free-text search query.
        match_count: Number of candidates requested from the ``search_kbli``
            RPC before reranking.
        top_k: Maximum number of reranked rows to return.

    Returns:
        A dict ``{"results": [...]}``. Each row is the dict returned by the
        RPC with an added ``"rerank_score"`` float, ordered by that score in
        descending order. The list is empty when the query is empty/blank or
        the RPC returns no rows.
    """
    # Same empty-input guard as get_embedding/search_kbli: do not spend a
    # model forward pass and a database round trip on an empty query.
    if not query or not query.strip():
        return {"results": []}

    embedding = embedder.encode(query).tolist()

    response = supabase.rpc(
        "search_kbli",
        {"query_embedding": embedding, "match_count": match_count},
    ).execute()
    candidates = response.data or []

    if not candidates:
        return {"results": []}

    # A row with a NULL or missing "judul"/"deskripsi" would make plain string
    # concatenation raise; treat such fields as empty text instead.
    pairs = [
        (query, (c.get("judul") or "") + " " + (c.get("deskripsi") or ""))
        for c in candidates
    ]
    scores = reranker.predict(pairs)

    for candidate, score in zip(candidates, scores):
        # Convert to a builtin float so the value is a plain Python number
        # in the returned payload.
        candidate["rerank_score"] = float(score)

    ranked = sorted(candidates, key=lambda row: row["rerank_score"], reverse=True)

    return {"results": ranked[:top_k]}
|
| 56 |
+
|
| 57 |
def search_kbli(text: str):
|
| 58 |
if not text:
|
| 59 |
return {"embedding": [], "results": []}
|
|
|
|
| 187 |
inp1.submit(get_embedding, inp1, out1, api_name="get_embedding")
|
| 188 |
btn_clear1.click(lambda: ("", None), None, [inp1, out1])
|
| 189 |
btn_submit1.click(get_embedding, inp1, out1, api_name="get_embedding")
|
| 190 |
+
|
| 191 |
+
with gr.Tab("Embedding Fine-tuned"):
|
| 192 |
+
with gr.Row():
|
| 193 |
+
with gr.Column(scale=1):
|
| 194 |
+
inp2 = gr.Textbox(label="Masukkan teks")
|
| 195 |
+
|
| 196 |
+
with gr.Row(elem_classes="btn-row"):
|
| 197 |
+
btn_clear2 = gr.Button("Clear", variant="secondary")
|
| 198 |
+
btn_submit2 = gr.Button("Submit", variant="primary")
|
| 199 |
+
|
| 200 |
+
with gr.Column(scale=1):
|
| 201 |
+
out2 = gr.JSON(label="Embedding Vector")
|
| 202 |
+
|
| 203 |
+
inp2.submit(fn_semantic, inp2, out2, api_name="fn_semantic")
|
| 204 |
+
btn_clear2.click(lambda: ("", None), None, [inp2, out2])
|
| 205 |
+
btn_submit2.click(fn_semantic, inp2, out2, api_name="fn_semantic")
|
| 206 |
|
| 207 |
with gr.Tab("Search KBLI"):
|
| 208 |
+
inp3 = gr.Textbox(label="Masukkan teks")
|
| 209 |
|
| 210 |
with gr.Row(elem_classes="btn-row-search"):
|
| 211 |
+
btn_clear3 = gr.Button("Clear", variant="secondary")
|
| 212 |
+
btn_submit3 = gr.Button("Submit", variant="primary")
|
| 213 |
|
| 214 |
+
out3 = gr.HTML(label="Hasil Pencarian Semantic")
|
| 215 |
|
| 216 |
+
inp3.submit(search_kbli, inp3, out3, api_name="search_kbli")
|
| 217 |
+
btn_clear3.click(lambda: ("", None), None, [inp3, out3])
|
| 218 |
+
btn_submit3.click(search_kbli, inp3, out3, api_name="search_kbli")
|
| 219 |
|
| 220 |
if __name__ == "__main__":
|
| 221 |
demo.queue().launch()
|