celalkartoglu commited on
Commit
a78d6c7
·
verified ·
1 Parent(s): 77f41bd

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. .gitattributes +1 -0
  2. README.md +7 -11
  3. app.py +68 -0
  4. e5_index.faiss +3 -0
  5. requirements.txt +6 -0
  6. texts.json +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ e5_index.faiss filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,8 @@
1
- ---
2
- title: Turkce Rag Qa Space
3
- emoji: 👀
4
- colorFrom: indigo
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 5.45.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
1
+ # Türkçe RAG QA (e5 + BERT-SQuAD)
 
 
 
 
 
 
 
 
 
2
 
3
+ - Retriever: `intfloat/multilingual-e5-base`
4
+ - Reader: `savasy/bert-base-turkish-squad`
5
+ - İndeks: FAISS (`e5_index.faiss`), metin parçaları: `texts.json`
6
+ - Basit Gradio arayüzü ile soru-cevap.
7
+
8
+ **Lisans:** Kullandığınız veri ve pasajların lisansından siz sorumlusunuz.
app.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Standard library
import json
import os  # NOTE(review): appears unused in this file — kept to avoid breaking anything unseen
import re

# Third-party
import faiss
import numpy as np
import gradio as gr
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# Artifacts built offline and shipped alongside this app.
INDEX_PATH = "e5_index.faiss"  # FAISS index over passage embeddings
TEXTS_PATH = "texts.json"      # passage texts; row j of the index maps to texts[j]

# Model identifiers (downloaded from the Hugging Face Hub on first run).
EMB_NAME = "intfloat/multilingual-e5-base"      # retriever (bi-encoder)
READER_NAME = "savasy/bert-base-turkish-squad"  # extractive QA reader

# Load retrieval artifacts once at import time.
index = faiss.read_index(INDEX_PATH)
with open(TEXTS_PATH, "r", encoding="utf-8") as f:
    texts = json.load(f)

# Embedder for queries; tokenizer + model wrapped in a QA pipeline.
embedder = SentenceTransformer(EMB_NAME)
qa_tok = AutoTokenizer.from_pretrained(READER_NAME)
qa_mod = AutoModelForQuestionAnswering.from_pretrained(READER_NAME)
qa = pipeline("question-answering", model=qa_mod, tokenizer=qa_tok, device_map="auto")
24
+
25
def search_semantic(q, k=80):
    """Return the indices of the top-*k* passages most similar to query *q*.

    The query is prefixed with "query: " (required by the e5 model family),
    embedded, L2-normalized, and searched against the module-level FAISS
    index. Returns a plain list of integer passage indices.
    """
    qv = embedder.encode(
        [f"query: {q}"], convert_to_numpy=True, normalize_embeddings=True
    )
    # Distances are never used downstream; keep only the indices.
    _, indices = index.search(qv.astype(np.float32), k)
    return indices[0].tolist()
29
+
30
def finalize_answer(raw_answer, context, max_chars=220):
    """Polish a raw extractive answer span for display.

    If a sentence of *context* contains the span, that whole sentence is
    shown instead (more readable). Whitespace is collapsed, the text is
    truncated at a word boundary to *max_chars*, terminal punctuation is
    guaranteed, and the first character is capitalized. Empty input yields
    the fallback string "Bilmiyorum" ("I don't know").
    """
    ans = (raw_answer or "").strip()
    if not ans:
        return "Bilmiyorum"

    # Prefer the full sentence that contains the answer span, if any.
    containing = None
    for sentence in re.split(r"(?<=[.!?])\s+", context):
        if ans in sentence:
            containing = sentence
            break

    chosen = containing if containing else ans
    text = re.sub(r"\s+", " ", chosen.strip()).strip()

    if len(text) > max_chars:
        clipped = text[:max_chars].rsplit(" ", 1)[0].rstrip()
        text = clipped + "…"
    if re.search(r"[.!?…]$", text) is None:
        text = text + "."
    return text[0].upper() + text[1:]
41
+
42
def answer(q, k=80, top_m=12, min_conf=0.10):
    """Answer question *q* via retrieve-then-read.

    Retrieves *k* passages, runs the QA reader over the first *top_m* of
    them, and keeps the highest-scoring span. Scores below *min_conf*
    (and empty spans) fall back to "Bilmiyorum". Returns a tuple of
    (display-ready answer, score rounded to 3 decimals).
    """
    retrieved = search_semantic(q, k=k)

    best_answer = "Bilmiyorum"
    best_score = 0.0
    best_context = ""

    # Reader pass over the leading retrieved passages only.
    for idx in retrieved[:top_m]:
        passage = texts[idx]
        result = qa({"question": q, "context": passage})
        span = result.get("answer", "").strip()
        score = float(result.get("score", 0.0))
        if score > best_score:
            best_answer = span if span else "Bilmiyorum"
            best_score = score
            best_context = passage

    # Low confidence or empty span -> refuse to answer.
    if best_score < min_conf or not best_answer:
        best_answer = "Bilmiyorum"

    pretty = finalize_answer(best_answer, best_context)
    return pretty, round(best_score, 3)
55
+
56
# --- Gradio UI -------------------------------------------------------------
# Component creation order inside the Blocks context defines the page layout,
# so statement order here is significant.
with gr.Blocks() as demo:
    gr.Markdown("# Türkçe RAG QA (e5 + BERT-SQuAD)")
    # Question input box.
    inp = gr.Textbox(label="Sorunuzu yazın", placeholder="Verem nedir?")
    with gr.Row():
        # Tuning knobs, wired 1:1 to answer()'s keyword parameters
        # (k, top_m, min_conf); slider defaults match the function defaults.
        k = gr.Slider(20, 120, value=80, step=10, label="k (retrieval)")
        m = gr.Slider(4, 24, value=12, step=1, label="top_m_for_qa")
        th = gr.Slider(0.05, 0.35, value=0.10, step=0.01, label="min_conf")
    out_ans = gr.Textbox(label="Cevap")
    out_sc = gr.Number(label="Güven", precision=3)
    btn = gr.Button("Sor")
    # answer(q, k, top_m, min_conf) -> (pretty_answer, rounded_score)
    btn.click(answer, inputs=[inp,k,m,th], outputs=[out_ans, out_sc])

demo.launch()
e5_index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f574fbbe622591dd81c9677b3e1a3505e38c50b8c0cd9595046bff2d652eb826
3
+ size 105907245
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ sentence-transformers==2.2.2
2
+ faiss-cpu
3
+ transformers
4
+ accelerate
5
+ torch
6
+ gradio
texts.json ADDED
The diff for this file is too large to render. See raw diff