MohJaf committed on
Commit
a871eb4
·
verified ·
1 Parent(s): 15cca04

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -1,35 +1,3 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.db filter=lfs diff=lfs merge=lfs -text
2
+ qdrant_storage/** filter=lfs diff=lfs merge=lfs -text
3
+ *.sqlite filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,12 +1,22 @@
1
  ---
2
- title: Bayan Search
3
- emoji: 🏃
4
- colorFrom: indigo
5
  colorTo: blue
6
  sdk: gradio
7
- sdk_version: 6.0.2
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Bayan Usuli BERT API
3
+ emoji: 📚
4
+ colorFrom: green
5
  colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 4.44.0
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
  ---
12
 
13
+ # Bayan Usuli BERT API
14
+
15
+ Arabic Islamic Jurisprudence (Usul al-Fiqh) Embedding Model API.
16
+
17
+ This space provides an API for the `MohJaf/bayan-usuli-bert` sentence-transformers model.
18
+
19
+ ## Features
20
+
21
+ - Get text embeddings for Arabic jurisprudence texts
22
+ - Compute semantic similarity between texts
app.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import sqlite3
4
+ import os
5
+ import shutil
6
+ from sentence_transformers import SentenceTransformer
7
+ from qdrant_client import QdrantClient
8
+ from huggingface_hub import hf_hub_download, snapshot_download
9
+
10
+ # Configuration: Hugging Face repo ids and the local paths the app reads its data from.
11
+ MODEL_PATH = "MohJaf/bayan-usuli-bert"  # model id handed to SentenceTransformer
12
+ DATASET_REPO = "MohJaf/bayan-usuli-data"  # dataset repo that download_data() pulls from
13
+ QDRANT_PATH = "./qdrant_storage"  # local folder opened by QdrantClient(path=...)
14
+ SQLITE_PATH = "./usuli_library.db"  # SQLite file queried by get_card_details()
15
+
16
def download_data():
    """Download the database files from the HuggingFace dataset if missing.

    The SQLite database (``SQLITE_PATH``) and the Qdrant storage folder
    (``QDRANT_PATH``) are each fetched from ``DATASET_REPO`` only when the
    local path does not exist yet.

    Every artifact is first copied to a ``.partial`` sibling and then renamed
    into place. The existence checks above are the only "already downloaded"
    signal, so a copy that is interrupted half-way must never leave something
    at the final path; otherwise later starts would skip the download and
    run against a truncated database.
    """
    print("Downloading data from HuggingFace...")

    # Download SQLite database
    if not os.path.exists(SQLITE_PATH):
        print("Downloading SQLite database...")
        sqlite_file = hf_hub_download(
            repo_id=DATASET_REPO,
            filename="usuli_library.db",
            repo_type="dataset",
        )
        partial_sqlite = SQLITE_PATH + ".partial"
        shutil.copy(sqlite_file, partial_sqlite)
        # Rename only after the copy finished, so the final path is all-or-nothing.
        os.replace(partial_sqlite, SQLITE_PATH)
        print(f"SQLite database downloaded to {SQLITE_PATH}")

    # Download Qdrant storage
    if not os.path.exists(QDRANT_PATH):
        print("Downloading Qdrant storage...")
        # Download the entire qdrant_storage folder
        snapshot_path = snapshot_download(
            repo_id=DATASET_REPO,
            repo_type="dataset",
            allow_patterns=["qdrant_storage/*"],
        )
        # Copy to local path
        src_qdrant = os.path.join(snapshot_path, "qdrant_storage")
        if os.path.exists(src_qdrant):
            partial_qdrant = QDRANT_PATH.rstrip("/\\") + ".partial"
            # Clear leftovers from a previously interrupted copy; copytree
            # refuses to write into an existing directory.
            shutil.rmtree(partial_qdrant, ignore_errors=True)
            shutil.copytree(src_qdrant, partial_qdrant)
            os.replace(partial_qdrant, QDRANT_PATH)
            print(f"Qdrant storage downloaded to {QDRANT_PATH}")
        else:
            # Make the problem visible instead of silently continuing.
            print(f"Warning: no qdrant_storage folder found in {DATASET_REPO}")

    print("Data download complete!")
47
+
48
+ # Download data first: runs at import time, before the model and Qdrant client are created
49
+ download_data()
50
+
51
+ # Load model and databases (module-level globals used by search())
52
+ print("Loading model...")
53
+ model = SentenceTransformer(MODEL_PATH)
54
+ print("Model loaded!")
55
+
56
+ print("Connecting to Qdrant...")
57
+ qdrant = QdrantClient(path=QDRANT_PATH)  # path= opens the on-disk store at QDRANT_PATH rather than a remote server
58
+ print("Qdrant connected!")
59
+
60
def get_card_details(card_ids):
    """Fetch card details from SQLite.

    Args:
        card_ids: Iterable of ``atomized_cards.id`` values. ``None`` entries
            are ignored; an empty or ``None`` argument returns ``{}`` without
            opening the database.

    Returns:
        A dict mapping each card id that has a row to
        ``{"title", "text", "reference"}``. ``reference`` is the row's
        ``paragraph_id`` when it is truthy, otherwise ``"p<page_start>"``.
        Ids with no matching row are simply absent from the result.
    """
    # Drop missing ids up front: binding None into the IN (...) list can never
    # match a row, and an all-None input should not open a connection at all.
    ids = [card_id for card_id in (card_ids or []) if card_id is not None]
    if not ids:
        return {}

    # Only "?" placeholders are interpolated into the SQL text; the ids
    # themselves are passed as bound parameters.
    placeholders = ",".join("?" for _ in ids)
    query = f"""
        SELECT ac.id, b.title, ac.full_text, ac.paragraph_id, ac.page_start
        FROM atomized_cards ac
        JOIN books b ON b.id = ac.book_id
        WHERE ac.id IN ({placeholders})
    """

    conn = sqlite3.connect(SQLITE_PATH)
    try:
        rows = conn.execute(query, ids).fetchall()
    finally:
        # Close even when the query raises, so failed requests do not leak
        # connections.
        conn.close()

    return {
        card_id: {
            "title": title,
            "text": text,
            "reference": para_id or f"p{page}",
        }
        for card_id, title, text, para_id, page in rows
    }
89
+
90
def _truncate(text, limit):
    """Return *text* cut to *limit* characters, adding "..." only if it was cut."""
    return text[:limit] + "..." if len(text) > limit else text


def search(query, top_k=5):
    """Semantic search for usuli texts.

    Args:
        query: The question text. ``None``, empty, or whitespace-only input
            is rejected.
        top_k: Maximum number of Qdrant hits to request; converted with
            ``int()``.

    Returns:
        ``{"error": "No query provided"}`` for a blank query. Otherwise a dict
        with ``"query"`` (the input), ``"results"`` (one entry per hit that
        has a matching SQLite row, with title, text capped at 500 characters,
        reference, and score rounded to 4 places), and ``"answer"`` (the top
        three results joined by blank lines, or an Arabic "no results"
        message).
    """
    if not query or not query.strip():
        return {"error": "No query provided"}

    # Encode query
    query_vector = model.encode(query).tolist()

    # Search in Qdrant
    # NOTE(review): recent qdrant-client releases appear to favour
    # `query_points` over `search` -- confirm against the installed version.
    results = qdrant.search(
        collection_name="usuli_cards",
        query_vector=query_vector,
        limit=int(top_k),
        with_payload=True,
    )

    def payload_id(point):
        # Presumably the payload "id" matches atomized_cards.id -- verify
        # against the indexing code.
        return point.payload.get("id") if point.payload else None

    # Skip hits without an id so None is never sent to the SQLite lookup.
    card_ids = [cid for cid in map(payload_id, results) if cid is not None]

    # Fetch details from SQLite
    card_details = get_card_details(card_ids)

    # Format results
    hits = []
    for r in results:
        card_id = payload_id(r)
        # Compare against None explicitly: an id of 0 is falsy but valid.
        if card_id is None or card_id not in card_details:
            continue
        details = card_details[card_id]
        # A NULL full_text column arrives as None; treat it as empty text.
        text = details["text"] or ""
        hits.append({
            "title": details["title"],
            "text": _truncate(text, 500),
            "reference": details["reference"],
            "score": round(r.score, 4),
        })

    # Build answer
    if hits:
        answer = "\n\n".join(
            f"{h['title']} ({h['reference']}): {_truncate(h['text'], 200)}"
            for h in hits[:3]
        )
    else:
        answer = "لم يتم العثور على نتائج"

    return {
        "query": query,
        "answer": answer,
        "results": hits,
    }
139
+
140
+ # Gradio interface: a question box and a result-count slider feed search(); its dict is shown in a JSON panel
141
+ with gr.Blocks(title="Bayan Usuli Search API", theme=gr.themes.Soft()) as demo:
142
+ gr.Markdown("# بيان - البحث الدلالي في علم الأصول")
143
+ gr.Markdown("ابحث في النصوص الأصولية باستخدام الذكاء الاصطناعي")
144
+
145
+ with gr.Row():
146
+ query_input = gr.Textbox(
147
+ label="السؤال",
148
+ placeholder="اكتب سؤالك الأصولي هنا...",
149
+ rtl=True,
150
+ lines=2
151
+ )
152
+
153
+ with gr.Row():
154
+ top_k = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="عدد النتائج")  # passed to search() as top_k
155
+ search_btn = gr.Button("بحث", variant="primary")
156
+
157
+ output = gr.JSON(label="النتائج")
158
+
159
+ search_btn.click(search, inputs=[query_input, top_k], outputs=output)  # button click runs search()
160
+ query_input.submit(search, inputs=[query_input, top_k], outputs=output)  # the textbox submit event runs the same handler
161
+
162
+ demo.launch()
qdrant_storage/.lock ADDED
@@ -0,0 +1 @@
 
 
1
+ tmp lock file
qdrant_storage/collection/usuli_cards/storage.sqlite ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6738bad99786d8934b5fb34b9b3bcade57fabb6d5b53b3203d528fc2532af569
3
+ size 542756864
qdrant_storage/meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"collections": {"usuli_cards": {"vectors": {"size": 768, "distance": "Cosine", "hnsw_config": null, "quantization_config": null, "on_disk": null}, "shard_number": null, "sharding_method": null, "replication_factor": null, "write_consistency_factor": null, "on_disk_payload": null, "hnsw_config": null, "wal_config": null, "optimizers_config": null, "init_from": null, "quantization_config": null, "sparse_vectors": null}}, "aliases": {}}
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ sentence-transformers>=2.2.0
3
+ qdrant-client>=1.7.0
4
+ huggingface_hub>=0.20.0
5
+ torch
6
+ numpy
usuli_library.db ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c391b99bf14a65a5e08811d1c2d11ca11df96826765d5e25a444a6d4153b3ff1
3
+ size 143626240