File size: 5,778 Bytes
56037a2
2d50652
 
56037a2
 
 
 
2d50652
56037a2
 
 
 
 
 
 
 
 
 
 
 
2d50652
 
56037a2
 
2d50652
56037a2
 
2d50652
 
 
56037a2
2d50652
 
56037a2
2d50652
56037a2
 
2d50652
56037a2
2d50652
56037a2
 
2d50652
 
56037a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d50652
 
56037a2
2d50652
56037a2
2d50652
 
 
 
 
56037a2
2d50652
56037a2
 
2d50652
56037a2
 
 
 
 
 
 
 
2d50652
56037a2
 
2d50652
56037a2
2d50652
 
56037a2
 
2d50652
56037a2
2d50652
 
 
 
 
 
 
56037a2
2d50652
56037a2
 
 
 
 
 
 
 
2d50652
 
 
 
 
56037a2
 
2d50652
 
56037a2
2d50652
 
56037a2
 
 
 
2d50652
56037a2
 
 
 
2d50652
56037a2
 
 
 
 
 
 
 
 
 
 
 
2d50652
56037a2
 
2d50652
56037a2
 
41f5c9b
56037a2
 
 
 
41f5c9b
56037a2
2d50652
 
 
 
 
56037a2
2d50652
56037a2
 
 
2d50652
56037a2
 
 
 
 
 
2d50652
 
56037a2
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
"""
ORD Reagent Index Builder - Gradio App (Simplified)
Runs directly on Hugging Face Spaces
"""

import gradio as gr
import os
import sys
from collections import defaultdict
from datasets import load_dataset, Dataset
from huggingface_hub import login

# Check for HF_TOKEN
HF_TOKEN = os.getenv("HF_TOKEN")

# Configuration
ORIGINAL_DATASET = "smitathkr1/ord-reactions"
HF_DATASET_NAME = "smitathkr1/ord-reagent-index"
SAMPLE_SIZE = None  # Set to 100 for testing

def build_index():
    """Stream the ORD reactions dataset, build a SMILES -> reaction-id
    search index, and push it to the Hugging Face Hub.

    Reads module-level configuration (HF_TOKEN, ORIGINAL_DATASET,
    HF_DATASET_NAME, SAMPLE_SIZE). Progress is reported via print() so it
    shows up in the Space logs.

    Returns:
        tuple[str, list]: (status message for the Textbox, up to 10 sample
        rows as [search_term, search_type, count] for the Dataframe).
        On failure the message starts with "ERROR" and the list is empty.
    """

    if not HF_TOKEN:
        return "ERROR: HF_TOKEN not found. Add it to Space secrets.", []

    try:
        print("\n" + "=" * 70)
        print("Starting ORD Reagent Index Builder")
        print("=" * 70 + "\n")

        # 1. Auth
        print("[1/5] Authenticating with Hugging Face...")
        login(token=HF_TOKEN)
        print("      OK - Authenticated\n")

        # 2. Load dataset — streaming avoids holding ~2.7M rows in memory.
        print("[2/5] Loading dataset in streaming mode...")
        ds = load_dataset(ORIGINAL_DATASET, split='train', streaming=True)
        print("      OK - Dataset loaded\n")

        # 3. Process reactions
        print("[3/5] Processing reactions...")
        print("      This will take 10-20 minutes, please wait...\n")

        # Map each SMILES string to the set of reaction ids that mention
        # it. A set deduplicates as we go instead of list(set(...)) later.
        smiles_to_reactions = defaultdict(set)

        processed = 0
        for reaction in ds:
            processed += 1

            # SAMPLE_SIZE (when set) caps the run for quick testing.
            if SAMPLE_SIZE and processed > SAMPLE_SIZE:
                break

            if processed % 100000 == 0:
                print(f"      [{processed:,}] reactions processed...")

            reaction_id = reaction.get('reaction_id')

            # Index every input and product SMILES of this reaction.
            # SMILES is case-sensitive (aromatic 'c' vs aliphatic 'C'),
            # so only strip whitespace — do NOT lowercase the key, which
            # would conflate distinct molecules.
            for smiles in (reaction.get('inputs_smiles', []) +
                           reaction.get('products_smiles', [])):
                if isinstance(smiles, str) and smiles.strip():
                    smiles_to_reactions[smiles.strip()].add(reaction_id)

        print(f"\n      Total: {processed:,} reactions processed\n")

        # 4. Build index rows. Sorting the ids (key=str tolerates a None
        # reaction_id) makes the uploaded dataset deterministic.
        print("[4/5] Building index...")
        index_entries = [
            {
                'search_term': smiles,
                'search_type': 'smiles',
                'reaction_ids': sorted(reaction_ids, key=str),
                'count': len(reaction_ids),
            }
            for smiles, reaction_ids in smiles_to_reactions.items()
        ]

        print(f"      Created {len(index_entries):,} index entries\n")

        # 5. Upload
        print("[5/5] Uploading to Hugging Face...")
        index_dataset = Dataset.from_list(index_entries)
        index_dataset.push_to_hub(HF_DATASET_NAME, private=False, token=HF_TOKEN)
        print("      OK - Upload complete\n")

        # Summary
        print("=" * 70)
        print("SUCCESS!")
        print("=" * 70)
        print(f"Total reactions: {processed:,}")
        print(f"Index entries: {len(index_entries):,}")
        print(f"Dataset: https://huggingface.co/datasets/{HF_DATASET_NAME}\n")

        # First 10 entries as rows for the Gradio Dataframe output.
        sample_data = [
            [entry['search_term'][:30], entry['search_type'], entry['count']]
            for entry in index_entries[:10]
        ]

        return "✅ Index built successfully! Check logs above.", sample_data

    except Exception as e:
        # Surface the failure in the UI and full traceback in the logs.
        error_msg = f"ERROR: {str(e)}\n\nDetails:\n{type(e).__name__}"
        print(f"\n{error_msg}\n")
        import traceback
        traceback.print_exc()
        return error_msg, []

# Create Gradio interface.
# NOTE: component order inside the `with` blocks determines the rendered
# layout, so statement order here is significant.
with gr.Blocks(title="ORD Reagent Index Builder", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧪 ORD Reagent Index Builder")
    gr.Markdown("Create fast search index for 2.7M reactions on Hugging Face Spaces")
    
    # Two-column intro: description on the left, dataset links on the right.
    with gr.Row():
        with gr.Column():
            gr.Markdown("""
            ### About
            This tool creates a fast search index for the Open Reaction Database.
            
            **Features:**
            - Streams 2.7M reactions (no memory issues)
            - SMILES indexing
            - Auto-uploads to Hugging Face
            
            **Time:** ~10-20 minutes
            """)
        
        with gr.Column():
            gr.Markdown("""
            ### Links
            [ORD Dataset](https://huggingface.co/datasets/smitathkr1/ord-reactions)
            
            [Index Dataset](https://huggingface.co/datasets/smitathkr1/ord-reagent-index)
            """)
    
    gr.Markdown("---")
    
    # Main button — kicks off the long-running build (see build_index above).
    start_btn = gr.Button("🚀 Start Building Index", size="lg", variant="primary")
    
    gr.Markdown("---")
    
    # Output widgets: a status line plus a preview table.
    gr.Markdown("### Output")
    status_output = gr.Textbox(
        label="Status",
        lines=3,
        interactive=False,
        placeholder="Click start button..."
    )
    
    gr.Markdown("### Sample Index Entries (First 10)")
    table_output = gr.Dataframe(
        headers=["Search Term", "Type", "Count"],
        interactive=False
    )
    
    # Event handler: build_index takes no inputs and returns
    # (status message, sample rows), matching the two outputs below.
    start_btn.click(
        fn=build_index,
        outputs=[status_output, table_output]
    )

if __name__ == "__main__":
    # Launch the Gradio server directly (no public share link; on HF Spaces
    # the platform exposes the app itself).
    demo.launch(share=False)