Hyeonseo commited on
Commit
a6b603e
·
verified ·
1 Parent(s): 27edc5f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +275 -0
app.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Open Dataset Finder (HF / Zenodo / Kaggle) with Gradio MCP enabled
3
+
4
+ import os, io, re, html, time, csv, subprocess, string, typing as T, json
5
+ from dataclasses import dataclass
6
+ from datetime import datetime
7
+
8
+ import requests
9
+ import pandas as pd
10
+ from rapidfuzz import fuzz
11
+ from rank_bm25 import BM25Okapi
12
+ from huggingface_hub import list_datasets, HfApi
13
+ import gradio as gr
14
+
15
+ # -------------------- Common Utilities --------------------
16
def to_dt_str(x) -> str:
    """Safely convert a datetime or date-like string into 'YYYY-MM-DD'.

    Returns "" for falsy input. Strings are tried against the formats the
    upstream APIs (HF / Zenodo / Kaggle) are known to emit; anything that
    still doesn't parse falls back to the first 10 characters of str(x).
    """
    if not x:
        return ""
    if isinstance(x, datetime):
        return x.strftime("%Y-%m-%d")
    s = str(x)
    for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S%z", "%Y-%m-%dT%H:%M:%S", "%Y/%m/%d", "%d/%m/%Y"):
        try:
            # The trailing "Z" (UTC marker) is stripped so the non-%z formats match.
            return datetime.strptime(s.replace("Z", ""), fmt).strftime("%Y-%m-%d")
        except ValueError:  # was a bare except: — only strptime's ValueError is expected here
            pass
    return s[:10]
29
+
30
def tokenize(s: str) -> T.List[str]:
    """Lowercase *s*, map every punctuation char to a space, and return the tokens."""
    cleaned = (s or "").lower().translate(
        str.maketrans(string.punctuation, " " * len(string.punctuation))
    )
    return cleaned.split()
35
+
36
+ # -------------------- Standard Schema --------------------
37
@dataclass
class Row:
    """Normalized search result shared by all three dataset sources."""
    source: str            # origin tag: "huggingface", "zenodo", or "kaggle"
    id: str                # source-specific identifier (repo id, record id, or owner/slug)
    title: str             # display title (falls back to id for some sources)
    description: str       # plain-text description; may be ""
    updated: str           # last-modified date as "YYYY-MM-DD", or "" when unknown
    url: str               # human-facing landing page
    download_url: str      # direct download link when available, else "" or the landing URL
    formats: T.List[str]   # observed file formats/extensions (may be empty)
47
+
48
+ # -------------------- Hugging Face (datasets) --------------------
49
def search_hf(q, limit=40):
    """Search Hugging Face Hub datasets for *q*.

    Gets the hit list via ``list_datasets``, then best-effort enrichment
    per dataset via ``dataset_info`` (description + last-modified date).
    Returns a list of Row; an error in the listing call yields [].
    """
    api = HfApi()
    try:
        hits = list_datasets(search=q, limit=limit)
    except Exception as e:
        print("HF list_datasets error:", e)
        return []

    rows = []
    for hit in hits:
        ds_id = getattr(hit, "id", None) or ""
        page = f"https://huggingface.co/datasets/{ds_id}"
        last_mod = to_dt_str(getattr(hit, "lastModified", None) or getattr(hit, "updated_at", None))

        summary = ""
        formats = []
        try:
            info = api.dataset_info(ds_id, timeout=15)
            card = getattr(info, "cardData", None) or {}
            summary = (card.get("description") if isinstance(card, dict) else "") or ""
            last_mod = to_dt_str(getattr(info, "lastModified", None) or getattr(info, "updated_at", None)) or last_mod
        except Exception:
            # Enrichment is best-effort; keep the bare listing entry.
            pass

        rows.append(Row("huggingface", ds_id, ds_id, summary, last_mod, page, "", formats))
    return rows
77
+
78
+ # -------------------- Zenodo --------------------
79
SAFE_TIMEOUT = 20
UA = {"User-Agent": "OpenDatasetFinder/mini/0.2 (+HF Space)"}

def safe_get(url, params=None, timeout=SAFE_TIMEOUT, retries=2):
    """GET *url* with linear-backoff retries; re-raise the final failure.

    Attempts ``retries + 1`` requests total, sleeping 1.2s, 2.4s, ...
    between failures, and raising whatever the last attempt raised.
    """
    attempt = 0
    while True:
        try:
            resp = requests.get(url, params=params, headers=UA, timeout=timeout)
            resp.raise_for_status()
            return resp
        except Exception:
            if attempt == retries:
                raise
            time.sleep(1.2 * (attempt + 1))
            attempt += 1
92
+
93
def search_zenodo(q, limit=40):
    """Query the Zenodo records API for datasets matching *q*; return Rows."""
    api_url = "https://zenodo.org/api/records"
    resp = safe_get(api_url, params={"q": q, "type": "dataset", "size": limit})
    records = resp.json().get("hits", {}).get("hits", [])

    results = []
    for rec in records:
        meta = rec.get("metadata", {}) or {}
        title = meta.get("title") or rec.get("title") or ""
        # Zenodo descriptions are HTML: unescape entities, then strip tags.
        desc = re.sub(r"<[^>]+>", " ", html.unescape(meta.get("description") or "")).strip()
        page = (rec.get("links", {}) or {}).get("html", "")
        files = rec.get("files") or []
        exts = {(f.get("type") or f.get("mimetype") or "").split("/")[-1] for f in files if f}
        first_dl = files[0].get("links", {}).get("self", "") if files else ""
        results.append(Row(
            "zenodo",
            str(rec.get("id") or ""),
            title,
            desc,
            to_dt_str(rec.get("updated")),
            page,
            first_dl,
            [e for e in exts if e],
        ))
    return results
109
+
110
+ # -------------------- Kaggle (env creds auto) --------------------
111
def ensure_kaggle_credentials():
    """Materialize ~/.kaggle/kaggle.json from env vars if it does not exist."""
    cred_path = os.path.expanduser("~/.kaggle/kaggle.json")
    if os.path.exists(cred_path):
        return
    username = os.environ.get("KAGGLE_USERNAME")
    api_key = os.environ.get("KAGGLE_KEY")
    if not username or not api_key:
        return
    os.makedirs(os.path.dirname(cred_path), exist_ok=True)
    with open(cred_path, "w") as fh:
        json.dump({"username": username, "key": api_key}, fh)
    # Kaggle's client refuses credentials readable by other users.
    os.chmod(cred_path, 0o600)
124
+
125
def kaggle_available():
    """True when Kaggle credentials are reachable via env vars or ~/.kaggle/kaggle.json."""
    has_env = bool(os.environ.get("KAGGLE_USERNAME") and os.environ.get("KAGGLE_KEY"))
    has_file = os.path.exists(os.path.expanduser("~/.kaggle/kaggle.json"))
    return has_env or has_file
128
+
129
def search_kaggle(q, limit=40):
    """API first → fallback CLI if empty/failure."""
    # Aggregated results; filled by whichever path (API or CLI) succeeds.
    rows=[]
    try:
        ensure_kaggle_credentials()
        # Imported lazily so the app still loads when the kaggle package
        # or its credentials are absent.
        from kaggle.api.kaggle_api_extended import KaggleApi
        api=KaggleApi(); api.authenticate()

        try:
            api_res = api.dataset_list(search=q, page=1)
        except TypeError:
            # Client versions differ in dataset_list's signature — treat as no hits.
            api_res = []

        if api_res:
            for d in api_res[:limit]:
                try:
                    # Per-dataset metadata lookup for description + date.
                    m = api.dataset_view(d.ref)
                    desc=(getattr(m, "description", "") or "").strip()
                    upd = to_dt_str(getattr(m, "lastUpdated", None))
                except Exception:
                    desc, upd = "", ""
                fmts=[]
                try:
                    # Derive the format list from file-name extensions.
                    files=api.dataset_list_files(d.ref).files
                    for f in files:
                        ext=(f.name.split(".")[-1] if "." in f.name else "").lower()
                        if ext: fmts.append(ext)
                    fmts = sorted(set(fmts))
                except Exception:
                    pass
                url=f"https://www.kaggle.com/datasets/{d.ref}"
                rows.append(Row("kaggle", d.ref, d.title or d.ref, desc, upd, url, url, fmts))
            return rows
    except Exception:
        # Any API-path failure falls through to the CLI path below.
        pass

    try:
        # CLI fallback: `kaggle datasets list` with CSV output on stdout.
        cli = subprocess.run(
            ["kaggle", "datasets", "list", "-s", q, "--csv", "-p", "1", "-r", str(max(20, min(100, limit)))],
            capture_output=True, text=True
        )
        if cli.returncode == 0 and cli.stdout.strip():
            f = io.StringIO(cli.stdout)
            reader = csv.DictReader(f)
            for i, r in enumerate(reader):
                if i >= limit:
                    break
                title = r.get("title") or ""
                url = r.get("url") or ""
                # Derive the "owner/slug" ref from the dataset URL when possible.
                ref = "/".join(url.rstrip("/").split("/")[-2:]) if "/datasets/" in url else url
                rows.append(Row(
                    "kaggle",
                    ref,
                    title,
                    (r.get("subtitle") or "").strip(),
                    (r.get("lastUpdated") or "")[:10],
                    url,
                    url,
                    []
                ))
    except Exception:
        # Best-effort: a missing CLI binary etc. simply yields fewer results.
        pass

    return rows
193
+
194
+ # -------------------- Ranking --------------------
195
def rank(q: str, rows: T.List[Row]):
    """Score *rows* against query *q* and return a DataFrame sorted by score.

    score = 0.6 * (BM25, normalized by the max score)
          + 0.35 * RapidFuzz token_set_ratio on title+description
          + 0.05 * recency bonus (linear decay to zero over 365 days)
    """
    cols = ["source","id","title","description","updated","url","download_url","formats","score"]
    if not rows:
        return pd.DataFrame(columns=cols)
    docs = [tokenize(r.title + " " + r.description) for r in rows]
    bm25 = BM25Okapi(docs)
    bm = bm25.get_scores(tokenize(q))
    mx = max(bm) if len(bm) > 0 else 1.0
    scored = []
    for i, r in enumerate(rows):
        fz = fuzz.token_set_ratio(q, r.title + " " + r.description) / 100.0
        rec = 0.0
        if r.updated:
            try:
                days = (datetime.utcnow() - datetime.strptime(r.updated, "%Y-%m-%d")).days
                rec = max(0.0, 1.0 - min(days, 365) / 365.0)
            except ValueError:  # was a bare except: — only an unparsable date is expected
                pass            # unparsable date → no recency bonus
        # +1e-9 guards against division by zero when every BM25 score is 0.
        score = 0.6 * (bm[i] / (mx + 1e-9)) + 0.35 * fz + 0.05 * rec
        scored.append([r.source, r.id, r.title, r.description[:500], r.updated,
                       r.url, r.download_url, ", ".join(r.formats), round(float(score), 4)])
    df = pd.DataFrame(scored, columns=cols)
    return df.sort_values("score", ascending=False).reset_index(drop=True)
217
+
218
+ # -------------------- Gradio UI --------------------
219
with gr.Blocks(title="Open Dataset Finder (HF • Zenodo • Kaggle)") as demo:
    gr.Markdown("### Search Hugging Face & Zenodo by default. Kaggle is enabled if credentials exist (API → CLI fallback).")
    with gr.Row():
        # Query text plus the per-source result cap.
        q = gr.Textbox(label="Query / Idea", value="korean weather")
        k = gr.Slider(10, 200, value=40, step=10, label="Results per source")
    with gr.Row():
        # Source toggles; Kaggle is off by default because it needs credentials.
        use_hf = gr.Checkbox(value=True, label="Hugging Face")
        use_zen = gr.Checkbox(value=True, label="Zenodo")
        use_kg = gr.Checkbox(value=False, label="Kaggle")
    btn = gr.Button("Search", variant="primary")
    out = gr.Dataframe(wrap=True)
    log = gr.Textbox(label="Logs", lines=8)

    def do_search(q_, k_, u_hf, u_zen, u_kg):
        """Fan the query out to the enabled sources, rank the merged hits.

        Returns (ranked DataFrame, newline-joined log text). Per-source
        failures are appended to the log rather than raised, so one
        broken source cannot take down the whole search.
        """
        logs=[]
        rows=[]
        try:
            if u_hf:
                logs.append("Searching Hugging Face…")
                rows+=search_hf(q_, int(k_))
        except Exception as e:
            logs.append(f"HF error: {e}")

        try:
            if u_zen:
                logs.append("Searching Zenodo…")
                rows+=search_zenodo(q_, int(k_))
        except Exception as e:
            logs.append(f"Zenodo error: {e}")

        if u_kg:
            if kaggle_available():
                try:
                    logs.append("Searching Kaggle…")
                    rows+=search_kaggle(q_, int(k_))
                except Exception as e:
                    logs.append(f"Kaggle error: {e}")
            else:
                logs.append("No Kaggle credentials found → skipped")

        df=rank(q_, rows)
        logs.append(f"Total {len(df)} results")
        return df, "\n".join(logs)

    btn.click(do_search, inputs=[q,k,use_hf,use_zen,use_kg], outputs=[out, log])
264
+
265
+ # -------------------- Run (Gradio + MCP) --------------------
266
+ # On Hugging Face Spaces, calling launch() here is fine.
267
if __name__ == "__main__":
    # IMPORTANT: mcp_server=True exposes MCP endpoints while keeping UI at root.
    demo.queue().launch(
        server_name="0.0.0.0",   # bind all interfaces (required inside a container/Space)
        server_port=7860,        # the port HF Spaces routes to by default
        show_error=True,         # surface tracebacks in the UI instead of a generic error
        debug=False,
        mcp_server=True,
    )