davidtran999 commited on
Commit
66cb7c5
·
verified ·
1 Parent(s): 2e00b5b

Upload backend/scripts/benchmark_search.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. backend/scripts/benchmark_search.py +104 -0
backend/scripts/benchmark_search.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Standard-library imports.
import os
import sys
import time
import json
from pathlib import Path
import statistics

# Ensure project root on path.
# Layout assumption: this file lives at <root>/backend/scripts/, so two
# parents up is the repository root — TODO confirm if the script is moved.
ROOT_DIR = Path(__file__).resolve().parents[2]
BACKEND_DIR = ROOT_DIR / "backend"
HUE_PORTAL_DIR = BACKEND_DIR / "hue_portal"

# Prepend each project directory (highest priority first) unless already present.
for path in (HUE_PORTAL_DIR, BACKEND_DIR, ROOT_DIR):
    if str(path) not in sys.path:
        sys.path.insert(0, str(path))

# Point Django at the project settings; setdefault lets an externally
# exported DJANGO_SETTINGS_MODULE take precedence.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "hue_portal.hue_portal.settings")

import django

# Initialize the Django app registry so the model imports below succeed.
django.setup()

from django.db import connection
from hue_portal.core.models import Procedure, Fine, Office, Advisory
from hue_portal.core.search_ml import search_with_ml
# Benchmark query sets, keyed by dataset name. Keys must match the keys of
# the `datasets` mapping built in run_benchmark(). Values are Vietnamese
# search phrases — presumably representative of real user traffic; verify
# with the search team before treating results as load-realistic.
QUERIES = {
    "procedure": [
        "đăng ký cư trú",
        "thủ tục pccc",
        "giấy tờ antt",
    ],
    "fine": [
        "mức phạt nồng độ cồn",
        "vượt đèn đỏ",
        "không đội mũ bảo hiểm",
    ],
    "office": [
        "công an phường",
        "điểm tiếp dân",
    ],
    "advisory": [
        "cảnh báo lừa đảo",
        "giả mạo công an",
    ],
}
def run_benchmark(iterations: int = 3):
    """Time `search_with_ml` for every query in QUERIES and collect stats.

    Args:
        iterations: How many times each query is executed; latency
            statistics are aggregated over these runs.

    Returns:
        dict with the database vendor, a UNIX timestamp, the iteration
        count, and an ``entries`` list holding, per (dataset, query) pair:
        avg/p95/min/max latency in milliseconds and the average number of
        results returned.
    """
    results = {
        "database_vendor": connection.vendor,
        "timestamp": time.time(),
        "iterations": iterations,
        "entries": [],
    }

    # Queryset + searchable fields per dataset; keys mirror QUERIES.
    datasets = {
        "procedure": (Procedure.objects.all(), ["title", "domain", "conditions", "dossier"]),
        "fine": (Fine.objects.all(), ["name", "code", "article", "decree", "remedial"]),
        "office": (Office.objects.all(), ["unit_name", "address", "district", "service_scope"]),
        "advisory": (Advisory.objects.all(), ["title", "summary"]),
    }

    for dataset, queries in QUERIES.items():
        qs, fields = datasets[dataset]
        for query in queries:
            durations = []
            counts = []
            for _ in range(iterations):
                start = time.perf_counter()
                items = list(search_with_ml(qs, query, fields, top_k=20))
                durations.append(time.perf_counter() - start)
                counts.append(len(items))

            # statistics.quantiles() needs only two data points. The original
            # guard required >= 20 samples, so with the default 3 iterations
            # p95 silently collapsed to max(); interpolate whenever possible.
            if len(durations) >= 2:
                p95 = statistics.quantiles(durations, n=20)[18]
            else:
                p95 = durations[0]

            results["entries"].append(
                {
                    "dataset": dataset,
                    "query": query,
                    "avg_duration_ms": statistics.mean(durations) * 1000,
                    "p95_duration_ms": p95 * 1000,
                    "min_duration_ms": min(durations) * 1000,
                    "max_duration_ms": max(durations) * 1000,
                    "avg_results": statistics.mean(counts),
                }
            )

    return results
def main():
    """Entry point: run the benchmark and persist the results as JSON.

    The iteration count can be overridden with the BENCH_ITERATIONS
    environment variable (default "3"; must parse as int, otherwise
    ValueError propagates). Output is written to
    ``<ROOT_DIR>/logs/benchmarks/search_benchmark_<epoch>.json``.
    """
    iterations = int(os.environ.get("BENCH_ITERATIONS", "3"))
    benchmark = run_benchmark(iterations=iterations)

    output_dir = ROOT_DIR / "logs" / "benchmarks"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"search_benchmark_{int(benchmark['timestamp'])}.json"
    # ensure_ascii=False emits raw Vietnamese text, so the file must be
    # written as UTF-8 explicitly; the platform locale default (e.g. cp1252
    # on Windows) would raise UnicodeEncodeError.
    output_file.write_text(
        json.dumps(benchmark, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    print(f"Benchmark completed. Results saved to {output_file}")


if __name__ == "__main__":
    main()