Spaces:
Running
Running
File size: 4,070 Bytes
cb4a1c8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 | """
Export all arXiv IDs from the Turso DB to arxiv_ids.txt.

Uses the same Turso HTTP pipeline API as turso_svc.py.
Paginates with LIMIT/OFFSET to handle 1.6M rows.

Usage (Windows cmd syntax; on POSIX shells use `export` instead of `set`):
    set TURSO_URL=libsql://...
    set TURSO_DB_TOKEN=...
    python scripts/export_arxiv_ids.py
"""
import os
import sys
import time
import httpx
# Number of rows requested per paginated query (Turso handles this fine).
BATCH_SIZE = 50_000

# The export lands one directory above this script, as arxiv_ids.txt.
OUTPUT_FILE = os.path.join(
    os.path.dirname(__file__),
    os.pardir,
    "arxiv_ids.txt",
)
def get_turso_config():
    """Read the Turso connection settings from the environment.

    Reads ``TURSO_URL`` and ``TURSO_DB_TOKEN`` (surrounding whitespace is
    ignored) and normalises the URL to the HTTP(S) base URL that the
    pipeline API is posted to:

    * ``libsql://host`` and ``wss://host`` become ``https://host``
    * ``ws://host`` becomes ``http://host``
    * ``https://host`` and ``http://host`` are kept as given
    * a bare ``host`` gets ``https://`` prepended

    Returns:
        A ``(url, token)`` tuple; ``url`` never ends with a slash.

    Exits the process with status 1 after printing an error when either
    variable is missing/empty or the URL uses an unsupported scheme.
    """
    url = os.getenv("TURSO_URL", "").strip()
    token = os.getenv("TURSO_DB_TOKEN", "").strip()
    if not url or not token:
        print("ERROR: Set TURSO_URL and TURSO_DB_TOKEN environment variables.")
        print(" Example:")
        print(" set TURSO_URL=libsql://your-db.turso.io")
        print(" set TURSO_DB_TOKEN=your-token")
        sys.exit(1)

    # Convert to an HTTP(S) URL. Blindly prepending "https://" to anything
    # that is not libsql:// or https:// would turn "http://host" into
    # "https://http://host", so every known scheme is mapped explicitly.
    http_scheme_for = {
        "libsql": "https",
        "wss": "https",
        "https": "https",
        "ws": "http",
        "http": "http",
    }
    scheme, separator, remainder = url.partition("://")
    if not separator:
        # No scheme at all: treat the value as a bare host name.
        url = "https://" + url
    else:
        target_scheme = http_scheme_for.get(scheme.lower())
        if target_scheme is None:
            print(f"ERROR: Unsupported URL scheme in TURSO_URL: {scheme}://")
            sys.exit(1)
        url = f"{target_scheme}://{remainder}"

    return url.rstrip("/"), token
def turso_query(url: str, token: str, sql: str, args: list = None) -> list[list]:
    """Run a single SQL statement through the Turso HTTP pipeline API.

    Args:
        url: Base HTTP(S) URL of the database, without a trailing slash.
        token: Auth token sent as a ``Bearer`` authorization header.
        sql: SQL text to execute.
        args: Optional statement arguments; only attached when non-empty.

    Returns:
        One list per result row holding the ``"value"`` entry of each
        column, or an empty list when the response carries no results.

    Raises:
        RuntimeError: If the first pipeline result is of type ``"error"``.
        httpx.HTTPStatusError: If the HTTP response has an error status.
    """
    statement = {"sql": sql}
    if args:
        statement["args"] = args

    # One pipeline: execute the statement, then close the stream.
    http_response = httpx.post(
        f"{url}/v2/pipeline",
        json={
            "requests": [
                {"type": "execute", "stmt": statement},
                {"type": "close"},
            ]
        },
        headers={
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
        },
        timeout=30,
    )
    http_response.raise_for_status()

    pipeline_results = http_response.json().get("results", [])
    if not pipeline_results:
        return []

    # Only the first entry (the "execute" request) carries row data.
    first_result = pipeline_results[0]
    if first_result.get("type") == "error":
        raise RuntimeError(f"Turso error: {first_result.get('error')}")

    raw_rows = first_result.get("response", {}).get("result", {}).get("rows", [])

    # Each raw row is a list of {"type": ..., "value": ...} dicts; keep values.
    parsed_rows = []
    for raw_row in raw_rows:
        parsed_rows.append([cell.get("value") for cell in raw_row])
    return parsed_rows
def main():
    """Export every arXiv ID from the ``papers`` table to ``OUTPUT_FILE``.

    Counts the rows, then pages through ``arxiv_id`` in chunks of
    ``BATCH_SIZE`` and writes the collected IDs one per line. Progress and
    a final summary are printed to stdout.

    Exits the process with status 1 when the table reports zero rows.
    """
    url, token = get_turso_config()

    # First, get total count
    print("[export] Counting papers in Turso...")
    count_rows = turso_query(url, token, "SELECT COUNT(*) FROM papers")
    total = int(count_rows[0][0]) if count_rows else 0
    print(f"[export] Found {total:,} papers")
    if total == 0:
        print("ERROR: No papers found. Check your Turso connection.")
        sys.exit(1)

    # Paginate and collect all IDs
    all_ids = []
    offset = 0
    t0 = time.perf_counter()
    while offset < total:
        batch_start = time.perf_counter()
        # Every page is a separate query, and SQL leaves row order undefined
        # without ORDER BY, so pages could overlap or skip rows. A fixed
        # ordering makes LIMIT/OFFSET windows consistent between requests.
        # NOTE(review): assumes arxiv_id is indexed so the sort is cheap.
        rows = turso_query(
            url, token,
            f"SELECT arxiv_id FROM papers ORDER BY arxiv_id "
            f"LIMIT {BATCH_SIZE} OFFSET {offset}"
        )
        batch_ms = (time.perf_counter() - batch_start) * 1000

        # Skip empty rows and NULL/empty IDs.
        batch_ids = [row[0] for row in rows if row and row[0]]
        all_ids.extend(batch_ids)
        offset += BATCH_SIZE

        pct = min(100, offset * 100 / total)
        print(f"[export] {len(all_ids):>10,} / {total:,} ({pct:.0f}%) "
              f"batch: {len(batch_ids):,} in {batch_ms:.0f}ms")

        if len(rows) < BATCH_SIZE:
            break  # No more rows

    elapsed = time.perf_counter() - t0
    print(f"\n[export] Collected {len(all_ids):,} arXiv IDs in {elapsed:.1f}s")

    # Write to file, one ID per line. Formatting via f-string keeps the
    # write from failing if the API returns a non-string value.
    output_path = os.path.abspath(OUTPUT_FILE)
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(f"{aid}\n" for aid in all_ids)

    file_size_mb = os.path.getsize(output_path) / (1024 * 1024)
    print(f"[export] Written to: {output_path}")
    print(f"[export] File size: {file_size_mb:.1f} MB")
    print(f"[export] Lines: {len(all_ids):,}")
    print("\n✅ Done! Feed this file to the ML Intern's Script 1:")
    print(" python 01_fetch_citation_edges.py --corpus-file arxiv_ids.txt")
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|