Spaces:
Runtime error
Runtime error
Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- README.md +7 -11
- app.py +68 -0
- e5_index.faiss +3 -0
- requirements.txt +6 -0
- texts.json +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
e5_index.faiss filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,12 +1,8 @@
|
|
| 1 |
-
-
|
| 2 |
-
title: Turkce Rag Qa Space
|
| 3 |
-
emoji: 👀
|
| 4 |
-
colorFrom: indigo
|
| 5 |
-
colorTo: purple
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version: 5.45.0
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Türkçe RAG QA (e5 + BERT-SQuAD)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
- Retriever: `intfloat/multilingual-e5-base`
|
| 4 |
+
- Reader: `savasy/bert-base-turkish-squad`
|
| 5 |
+
- İndeks: FAISS (`e5_index.faiss`), metin parçaları: `texts.json`
|
| 6 |
+
- Basit Gradio arayüzü ile soru-cevap.
|
| 7 |
+
|
| 8 |
+
**Lisans:** Kullandığınız veri ve pasajların lisansından siz sorumlusunuz.
|
app.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json, faiss, os, re
import numpy as np
import gradio as gr
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# Artifacts shipped with the Space (must sit next to app.py).
INDEX_PATH = "e5_index.faiss"  # FAISS index of passage embeddings
TEXTS_PATH = "texts.json"      # passages; row i must align with index vector i

# Models (downloaded from the Hugging Face Hub on first run)
EMB_NAME = "intfloat/multilingual-e5-base"      # retriever encoder
READER_NAME = "savasy/bert-base-turkish-squad"  # Turkish extractive-QA reader

# Loading — all of this runs at import time (module-level side effects).
index = faiss.read_index(INDEX_PATH)
with open(TEXTS_PATH, "r", encoding="utf-8") as f:
    texts = json.load(f)  # NOTE(review): presumably list[str] aligned with the FAISS index — confirm against the index-building script

embedder = SentenceTransformer(EMB_NAME)
qa_tok = AutoTokenizer.from_pretrained(READER_NAME)
qa_mod = AutoModelForQuestionAnswering.from_pretrained(READER_NAME)
# device_map="auto" lets accelerate place the reader on GPU when available.
qa = pipeline("question-answering", model=qa_mod, tokenizer=qa_tok, device_map="auto")
| 24 |
+
|
| 25 |
+
def search_semantic(q, k=80):
    """Retrieve the ids of the k passages most similar to question *q*.

    Args:
        q: the user question, plain text.
        k: number of nearest neighbours to request from FAISS.

    Returns:
        list[int]: indices into ``texts``, most similar first. Invalid ids
        are dropped (see below), so the list may be shorter than ``k``.
    """
    # e5 models require the "query: " prefix on the search side; embeddings
    # are L2-normalized so the index search behaves as cosine similarity.
    qv = embedder.encode([f"query: {q}"], convert_to_numpy=True, normalize_embeddings=True)
    _, ids = index.search(qv.astype(np.float32), k)
    # FAISS pads the result with -1 when k exceeds the number of indexed
    # vectors; without this filter a caller doing texts[j] would silently
    # read texts[-1] (the last passage) instead of failing.
    return [i for i in ids[0].tolist() if i >= 0]
|
| 29 |
+
|
| 30 |
+
def finalize_answer(raw_answer, context, max_chars=220):
    """Polish a raw extractive-QA span into one short, readable sentence.

    Expands the span to the full sentence of *context* that contains it
    (when such a sentence exists), collapses whitespace, truncates to
    *max_chars* at a word boundary (appending an ellipsis), guarantees
    terminal punctuation, and capitalizes the first character.

    Returns "Bilmiyorum" when the span is empty or None.
    """
    span = (raw_answer or "").strip()
    if not span:
        return "Bilmiyorum"

    # Prefer the whole containing sentence over the bare span — it reads better.
    sentence = span
    for candidate in re.split(r"(?<=[.!?])\s+", context):
        if span in candidate:
            sentence = candidate
            break

    cleaned = re.sub(r"\s+", " ", sentence.strip()).strip()
    if len(cleaned) > max_chars:
        # Cut at the last word boundary within the budget, then mark the cut.
        cleaned = cleaned[:max_chars].rsplit(" ", 1)[0].rstrip() + "…"
    if re.search(r"[.!?…]$", cleaned) is None:
        cleaned += "."
    return cleaned[0].upper() + cleaned[1:]
|
| 41 |
+
|
| 42 |
+
def answer(q, k=80, top_m=12, min_conf=0.10):
    """Full RAG pipeline: retrieve passages, read each one, return the best.

    Args:
        q: user question.
        k: retrieval depth handed to ``search_semantic``.
        top_m: how many of the retrieved passages are fed to the reader.
        min_conf: minimum reader score; below it we abstain with "Bilmiyorum".

    Returns:
        (str, float): the polished answer and the reader score rounded to 3 dp.
    """
    candidate_ids = search_semantic(q, k=k)
    best_answer, best_score, best_ctx = "Bilmiyorum", 0.0, ""
    # Run the extractive reader over the top passages, keep the highest score.
    for idx in candidate_ids[:top_m]:
        passage = texts[idx]
        result = qa({"question": q, "context": passage})
        span = result.get("answer", "").strip()
        score = float(result.get("score", 0.0))
        if score > best_score:
            best_answer = span or "Bilmiyorum"
            best_score = score
            best_ctx = passage
    # Low confidence (or an empty span) means we abstain.
    if best_score < min_conf or not best_answer:
        best_answer = "Bilmiyorum"
    return finalize_answer(best_answer, best_ctx), round(best_score, 3)
|
| 55 |
+
|
| 56 |
+
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Türkçe RAG QA (e5 + BERT-SQuAD)")
    question_box = gr.Textbox(label="Sorunuzu yazın", placeholder="Verem nedir?")
    with gr.Row():
        k_slider = gr.Slider(20, 120, value=80, step=10, label="k (retrieval)")
        m_slider = gr.Slider(4, 24, value=12, step=1, label="top_m_for_qa")
        conf_slider = gr.Slider(0.05, 0.35, value=0.10, step=0.01, label="min_conf")
    answer_box = gr.Textbox(label="Cevap")
    score_box = gr.Number(label="Güven", precision=3)
    ask_button = gr.Button("Sor")
    # Wire the button to the RAG pipeline; slider values map onto answer()'s
    # (k, top_m, min_conf) parameters in order.
    ask_button.click(
        answer,
        inputs=[question_box, k_slider, m_slider, conf_slider],
        outputs=[answer_box, score_box],
    )

demo.launch()
|
e5_index.faiss
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f574fbbe622591dd81c9677b3e1a3505e38c50b8c0cd9595046bff2d652eb826
|
| 3 |
+
size 105907245
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
sentence-transformers==2.2.2
|
| 2 |
+
faiss-cpu
|
| 3 |
+
transformers
|
| 4 |
+
accelerate
|
| 5 |
+
torch
|
| 6 |
+
gradio
|
texts.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|