# NOTE(review): removed non-Python extraction artifacts here (file-size banner,
# git blame hashes, and a line-number gutter) that would break the module.
"""
ORD Reagent Index Builder - Gradio App (Simplified)
Runs directly on Hugging Face Spaces
"""
import gradio as gr
import os
import sys
from collections import defaultdict
from datasets import load_dataset, Dataset
from huggingface_hub import login
# Check for HF_TOKEN
HF_TOKEN = os.getenv("HF_TOKEN")
# Configuration
ORIGINAL_DATASET = "smitathkr1/ord-reactions"
HF_DATASET_NAME = "smitathkr1/ord-reagent-index"
SAMPLE_SIZE = None # Set to 100 for testing
def build_index():
    """Build the SMILES -> reaction-ID search index and publish it.

    Streams the ORD reactions dataset (so the full 2.7M reactions never
    need to fit in memory), groups reaction IDs by normalized SMILES
    string, and pushes the resulting index dataset to the Hugging Face
    Hub under ``HF_DATASET_NAME``.

    Returns:
        tuple[str, list]: A human-readable status message and up to 10
        sample rows ``[search_term, search_type, count]`` for the UI
        table. On failure the message describes the error and the sample
        list is empty.
    """
    # Fail fast: pushing to the Hub is impossible without a token.
    if not HF_TOKEN:
        return "ERROR: HF_TOKEN not found. Add it to Space secrets.", []
    try:
        print("\n" + "="*70)
        print("Starting ORD Reagent Index Builder")
        print("="*70 + "\n")
        # 1. Auth
        print("[1/5] Authenticating with Hugging Face...")
        login(token=HF_TOKEN)
        print(" OK - Authenticated\n")
        # 2. Load dataset; streaming avoids downloading everything up front.
        print("[2/5] Loading dataset in streaming mode...")
        ds = load_dataset(ORIGINAL_DATASET, split='train', streaming=True)
        print(" OK - Dataset loaded\n")
        # 3. Process reactions
        print("[3/5] Processing reactions...")
        print(" This will take 10-20 minutes, please wait...\n")
        smiles_to_reactions = defaultdict(list)
        processed = 0
        for reaction in ds:
            processed += 1
            # SAMPLE_SIZE (when set) caps the run for quick testing.
            if SAMPLE_SIZE and processed > SAMPLE_SIZE:
                break
            if processed % 100000 == 0:
                print(f" [{processed:,}] reactions processed...")
            reaction_id = reaction.get('reaction_id')
            # Index SMILES from both reaction inputs and products.
            # NOTE(review): SMILES are case-sensitive (aromatic 'c' vs
            # aliphatic 'C'), so lowercasing can merge distinct molecules.
            # Kept for compatibility with the existing case-insensitive
            # search index — confirm the query side normalizes the same way.
            for smiles in (reaction.get('inputs_smiles', []) + reaction.get('products_smiles', [])):
                if isinstance(smiles, str) and smiles.strip():
                    smiles_lower = smiles.lower().strip()
                    smiles_to_reactions[smiles_lower].append(reaction_id)
        print(f"\n Total: {processed:,} reactions processed\n")
        # 4. Build index rows, de-duplicating reaction IDs per SMILES.
        print("[4/5] Building index...")
        index_entries = []
        for smiles, reaction_ids in smiles_to_reactions.items():
            unique_ids = list(set(reaction_ids))
            index_entries.append({
                'search_term': smiles,
                'search_type': 'smiles',
                'reaction_ids': unique_ids,
                'count': len(unique_ids)
            })
        print(f" Created {len(index_entries):,} index entries\n")
        # 5. Upload
        print("[5/5] Uploading to Hugging Face...")
        index_dataset = Dataset.from_list(index_entries)
        index_dataset.push_to_hub(HF_DATASET_NAME, private=False, token=HF_TOKEN)
        print(" OK - Upload complete\n")
        # Summary
        print("="*70)
        print("SUCCESS!")
        print("="*70)
        print(f"Total reactions: {processed:,}")
        print(f"Index entries: {len(index_entries):,}")
        print(f"Dataset: https://huggingface.co/datasets/{HF_DATASET_NAME}\n")
        # First 10 rows for the UI preview table (term truncated for display).
        sample_data = [
            [entry['search_term'][:30], entry['search_type'], entry['count']]
            for entry in index_entries[:10]
        ]
        return "✅ Index built successfully! Check logs above.", sample_data
    except Exception as e:
        # Broad catch is deliberate: this is the UI boundary — report the
        # failure in the status box instead of crashing the Space.
        error_msg = f"ERROR: {str(e)}\n\nDetails:\n{type(e).__name__}"
        print(f"\n{error_msg}\n")
        import traceback
        traceback.print_exc()
        return error_msg, []
# Create Gradio interface.
# The module-level `demo` object is what Hugging Face Spaces serves.
with gr.Blocks(title="ORD Reagent Index Builder", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧪 ORD Reagent Index Builder")
    gr.Markdown("Create fast search index for 2.7M reactions on Hugging Face Spaces")
    # Two-column header: description on the left, dataset links on the right.
    with gr.Row():
        with gr.Column():
            gr.Markdown("""
### About
This tool creates a fast search index for the Open Reaction Database.
**Features:**
- Streams 2.7M reactions (no memory issues)
- SMILES indexing
- Auto-uploads to Hugging Face
**Time:** ~10-20 minutes
""")
        with gr.Column():
            gr.Markdown("""
### Links
[ORD Dataset](https://huggingface.co/datasets/smitathkr1/ord-reactions)
[Index Dataset](https://huggingface.co/datasets/smitathkr1/ord-reagent-index)
""")
    gr.Markdown("---")
    # Main button that kicks off the (long-running) index build.
    start_btn = gr.Button("🚀 Start Building Index", size="lg", variant="primary")
    gr.Markdown("---")
    # Output widgets: status text plus a preview table of index entries.
    gr.Markdown("### Output")
    status_output = gr.Textbox(
        label="Status",
        lines=3,
        interactive=False,
        placeholder="Click start button..."
    )
    gr.Markdown("### Sample Index Entries (First 10)")
    table_output = gr.Dataframe(
        headers=["Search Term", "Type", "Count"],
        interactive=False
    )
    # Event handler: build_index returns (status_message, sample_rows),
    # matching the two outputs in order.
    start_btn.click(
        fn=build_index,
        outputs=[status_output, table_output]
    )

if __name__ == "__main__":
    # share=False: Spaces provides its own public URL; no tunnel needed.
    demo.launch(share=False)