Commit: Update app.py
(Space status: Build error)

app.py (CHANGED)
@@ -8,26 +8,25 @@ from sentence_transformers import SentenceTransformer
 from dspy import Example, MIPROv2, Evaluate, evaluate
 from dspy import LiteLLM
 
-# Load the token from
-HF_TOKEN = os.environ
+# Load the token from Secrets
+HF_TOKEN = os.environ["HF_TOKEN"]
 
-#
+# Initialize the model via LiteLLM from the Hugging Face API
 dspy.settings.configure(
     lm=LiteLLM(
-        model="HuggingFaceH4/zephyr-7b-beta", #
+        model="HuggingFaceH4/zephyr-7b-beta", # choose a supported Instruct model
         api_base="https://api-inference.huggingface.co/v1",
         api_key=HF_TOKEN
     )
 )
 
-# Set up
+# Set up the Chroma database
 client = chromadb.PersistentClient(path="./chroma_db")
 col = client.get_or_create_collection(name="arabic_docs")
 
-# Model
+# Set up the LaBSE model for Arabic embedding
 embedder = SentenceTransformer("sentence-transformers/LaBSE")
 
-# Extract the texts from the PDF
 def process_pdf(pdf_bytes):
     doc = fitz.open(stream=pdf_bytes, filetype="pdf")
     texts = []
@@ -38,7 +37,6 @@ def process_pdf(pdf_bytes):
         texts.append(chunk.strip())
     return texts
 
-# Insert the texts into the Chroma database
 def ingest(pdf_file):
     pdf_bytes = pdf_file
     texts = process_pdf(pdf_bytes)
@@ -47,20 +45,17 @@ def ingest(pdf_file):
         col.add(ids=[f"chunk_{i}"], embeddings=[emb.tolist()], metadatas=[{"text": chunk}])
     return f"✅ تمت إضافة {len(texts)} مقطعاً."
 
-# Retrieve the context closest to the question
 def retrieve_context(question):
     embedding = embedder.encode([question])[0]
     results = col.query(query_embeddings=[embedding.tolist()], n_results=3)
     context_list = [m["text"] for m in results["metadatas"][0]]
     return "\n\n".join(context_list)
 
-# Define the RAG module signature
 class RagSig(dspy.Signature):
     question: str = dspy.InputField()
     context: str = dspy.InputField()
     answer: str = dspy.OutputField()
 
-# RAG module
 class RagMod(dspy.Module):
     def __init__(self):
         super().__init__()
@@ -72,17 +67,14 @@ class RagMod(dspy.Module):
 
 model = RagMod()
 
-# Generate the answer
 def answer(question):
     out = model(question)
     return out.answer
 
-# Load the training dataset
 def load_dataset(path):
     with open(path, "r", encoding="utf-8") as f:
         return [Example(**json.loads(l)).with_inputs("question") for l in f]
 
-# Optimize the model
 def optimize(train_file, val_file):
     global model
     trainset = load_dataset(train_file.name)
@@ -92,9 +84,8 @@ def optimize(train_file, val_file):
     model = optimized
     return "✅ تم تحسين النموذج!"
 
-# Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("## 🧠 نظام RAG عربي باستخدام DSPy + ChromaDB +
+    gr.Markdown("## 🧠 نظام RAG عربي باستخدام DSPy + ChromaDB + Hugging Face Inference")
 
     with gr.Tab("📥 تحميل وتخزين"):
         pdf_input = gr.File(label="ارفع ملف PDF", type="binary")