# app.py -- copied from a Hugging Face Space file view.
# Last commit: "Update app.py" by celalkartoglu (5957062, verified).
import json, faiss, re
import numpy as np
import gradio as gr
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
# ---- Configuration -------------------------------------------------------
# Paths of the retrieval artifacts loaded at import time.
INDEX_PATH = "e5_index.faiss"
TEXTS_PATH = "texts.json"
# Model identifiers for the three stages: embedding, answer extraction, reranking.
EMB_NAME = "intfloat/multilingual-e5-base"
READER_NAME = "savasy/bert-base-turkish-squad"
RERANK_MODEL = "BAAI/bge-reranker-v2-m3"

# ---- Load retrieval artifacts and models once, at import time -------------
index = faiss.read_index(INDEX_PATH)
with open(TEXTS_PATH, "r", encoding="utf-8") as f:
    # `texts` is indexed by the integer ids returned from `index.search`.
    texts = json.load(f)

embedder = SentenceTransformer(EMB_NAME)

qa_tok = AutoTokenizer.from_pretrained(READER_NAME)
qa_mod = AutoModelForQuestionAnswering.from_pretrained(READER_NAME)
# NOTE(review): `device_map="auto"` is passed together with an already
# instantiated model -- confirm the installed transformers version honours it
# in that case (and that `accelerate` is available in the deployment).
qa = pipeline("question-answering", model=qa_mod, tokenizer=qa_tok, device_map="auto")

reranker = CrossEncoder(RERANK_MODEL)
def search_semantic(q, k=80):
    """Return the ids of the passages nearest to query *q* in the FAISS index.

    Args:
        q: The user question as plain text.
        k: Maximum number of neighbours to request. Coerced to ``int`` so
            numeric UI values can be passed straight through.

    Returns:
        A list of non-negative ``int`` ids in the order returned by the index
        search. May be shorter than *k*, and is empty when ``k <= 0``.
    """
    k = int(k)
    if k <= 0:
        return []

    # The "query: " prefix is prepended before encoding; the embedding is
    # L2-normalised by the encoder.
    qv = embedder.encode(
        [f"query: {q}"], convert_to_numpy=True, normalize_embeddings=True
    )
    _, ids = index.search(qv.astype(np.float32), k)

    # FAISS pads the result with -1 when fewer than k vectors are available.
    # Those must be dropped: used as a Python index, -1 would silently select
    # the *last* passage instead of signalling "no result".
    return [int(i) for i in ids[0] if i >= 0]
def finalize_answer(raw_answer, context, max_chars=220):
    """Turn a raw extracted answer span into a short, display-ready sentence.

    If a sentence of *context* contains the answer span, that whole sentence
    is shown instead of the bare span. The result is whitespace-normalised,
    truncated to roughly *max_chars* characters at a word boundary (with a
    trailing ellipsis), given terminal punctuation, and capitalised.

    Args:
        raw_answer: The answer span; ``None`` or blank yields "Bilmiyorum".
        context: The passage the span was extracted from; ``None`` is treated
            as an empty passage.
        max_chars: Length limit applied before the ellipsis is appended.

    Returns:
        The formatted answer string, or the bare string "Bilmiyorum" when
        *raw_answer* is empty.
    """
    ans = (raw_answer or "").strip()
    if not ans:
        return "Bilmiyorum"

    # Split on whitespace that follows sentence-ending punctuation and take
    # the first sentence that contains the span; fall back to the span itself.
    sents = re.split(r"(?<=[.!?])\s+", context or "")
    hit = next((s for s in sents if ans in s), None)
    text = re.sub(r"\s+", " ", hit or ans).strip()

    if len(text) > max_chars:
        # Cut at the last space inside the limit so no word is split in half.
        text = text[:max_chars].rsplit(" ", 1)[0].rstrip() + "…"
    if not re.search(r"[.!?…]$", text):
        text += "."

    # str.upper() is locale-independent and maps "i" to "I"; in Turkish the
    # capital of dotted "i" is "İ" (dotless "ı" -> "I" is already handled
    # correctly by upper()).
    first = text[0]
    head = "İ" if first == "i" else first.upper()
    return head + text[1:]
# ---- Answerers (do not return sources) ----
def _no_answer(score=0.0):
    """Return the formatted fallback answer paired with *score*."""
    # An empty context is passed on purpose: the fallback word must never be
    # looked up inside a retrieved passage.
    return finalize_answer("Bilmiyorum", ""), round(float(score), 3)


def _read_best(q, contexts, min_conf):
    """Run the extractive reader over *contexts* and keep the best span.

    Args:
        q: The question text.
        contexts: Passages to read, in priority order.
        min_conf: Minimum reader score required to accept an answer.

    Returns:
        ``(answer_text, score)`` where ``answer_text`` has been passed through
        ``finalize_answer`` and ``score`` is rounded to three decimals. When
        the best score is below *min_conf* the fallback answer is returned
        together with that (low) score.
    """
    best_answer, best_score, best_ctx = "", 0.0, ""
    for ctx in contexts:
        out = qa(question=q, context=ctx)
        span = (out.get("answer") or "").strip()
        score = float(out.get("score", 0.0))
        # Strictly greater: on ties the earlier (higher-priority) passage wins.
        if score > best_score:
            best_answer, best_score, best_ctx = span, score, ctx

    if best_score < float(min_conf) or not best_answer:
        return _no_answer(best_score)
    return finalize_answer(best_answer, best_ctx), round(best_score, 3)


def _valid_passages(ids):
    """Map retrieved ids to passages, skipping ids outside ``texts``."""
    return [texts[i] for i in ids if 0 <= i < len(texts)]


def answer_rerank(q, k=80, top_m=12, min_conf=0.10):
    """Answer *q* using retrieval, cross-encoder reranking, then the reader.

    Args:
        q: The question text; blank or ``None`` yields the fallback answer.
        k: Number of passages to retrieve from the index.
        top_m: Number of top reranked passages handed to the reader.
        min_conf: Minimum reader score required to accept an answer.

    Returns:
        ``(answer_text, score)``; sources are not returned.
    """
    q = (q or "").strip()
    if not q:
        return _no_answer()

    cand = _valid_passages(search_semantic(q, k=k))
    if not cand:
        return _no_answer()

    scores = reranker.predict([[q, c] for c in cand])
    # Sort by reranker score only (descending); the key avoids comparing the
    # passages themselves when two scores are equal.
    ranked = sorted(zip(scores, cand), key=lambda pair: pair[0], reverse=True)
    reranked = [passage for _, passage in ranked]

    return _read_best(q, reranked[: int(top_m)], min_conf)


def answer_simple(q, k=80, top_m=12, min_conf=0.10):
    """Answer *q* using retrieval order directly, without reranking.

    Args:
        q: The question text; blank or ``None`` yields the fallback answer.
        k: Number of passages to retrieve from the index.
        top_m: Number of top retrieved passages handed to the reader.
        min_conf: Minimum reader score required to accept an answer.

    Returns:
        ``(answer_text, score)``; sources are not returned.
    """
    q = (q or "").strip()
    if not q:
        return _no_answer()

    ids = search_semantic(q, k=k)
    # Slice the ids first, as the original did, then drop invalid ones.
    contexts = _valid_passages(ids[: int(top_m)])
    return _read_best(q, contexts, min_conf)
# ---- Gradio UI (no source output and no hints) ----
with gr.Blocks() as demo:
    gr.Markdown("# Türkçe RAG QA (e5 + BERT-SQuAD)")
    inp = gr.Textbox(label="Sorunuzu yazın", placeholder="Verem nedir?")

    # NOTE(review): the original indentation was lost, so which controls
    # belong inside the Row is inferred -- confirm against the deployed layout.
    with gr.Row():
        k = gr.Slider(20, 120, value=80, step=10, label="k (retrieval)")
        m = gr.Slider(4, 24, value=12, step=1, label="top_m_for_qa")
        th = gr.Slider(0.05, 0.35, value=0.10, step=0.01, label="min_conf")
        use_rerank = gr.Checkbox(value=True, label="Reranker kullan (BGE v2 m3)")

    out_ans = gr.Textbox(label="Cevap")
    out_sc = gr.Number(label="Güven", precision=3)
    btn = gr.Button("Sor")

    def route(q, kk, mm, tt, rr):
        """Dispatch a button click to the reranking or the plain answerer.

        Args:
            q: Question text from the textbox.
            kk: Retrieval depth from the ``k`` slider.
            mm: Number of passages to read from the ``top_m_for_qa`` slider.
            tt: Confidence threshold from the ``min_conf`` slider.
            rr: Whether the reranker checkbox is ticked.

        Returns:
            The ``(answer_text, score)`` pair produced by the chosen answerer.
        """
        answer_fn = answer_rerank if rr else answer_simple
        # Coerce the count-like values to int: they are used as a search
        # depth and a slice bound downstream, where a float would fail.
        return answer_fn(q, int(kk), int(mm), tt)

    btn.click(route, inputs=[inp, k, m, th, use_rerank], outputs=[out_ans, out_sc])

demo.queue().launch()