pasxalisag committed on
Commit
8aeda0c
·
verified ·
1 Parent(s): 5fdee4f

Delete build_hf.py

Browse files
Files changed (1) hide show
  1. build_hf.py +0 -200
build_hf.py DELETED
@@ -1,200 +0,0 @@
1
- """
2
- Build script optimized for Hugging Face Spaces deployment
3
- Maintains the exact same SOTA RAG architecture
4
- """
5
- import os
6
- import sys
7
- import logging
8
- import pickle
9
- import json
10
- import numpy as np
11
- import torch
12
- from pathlib import Path
13
-
14
- # Add the current directory to the import path
15
- sys.path.append('.')
16
-
17
- from app import (
18
- load_opc_datasets,
19
- build_retrieval_system,
20
- ARTIFACT_DIR,
21
- FAISS_AVAILABLE,
22
- MODEL_NAME,
23
- EMBED_MODEL,
24
- MAX_CORPUS_SIZE
25
- )
26
-
27
# Configure logging
# Console output is always enabled. The file handler is optional because
# logging.FileHandler opens its target on construction and raises OSError
# when /data is missing or not writable, which would otherwise abort the
# whole build at import time.
_log_handlers = [logging.StreamHandler(sys.stdout)]
try:
    _log_handlers.append(logging.FileHandler('/data/build.log'))
except OSError as _log_error:
    # NOTE(review): /data is presumably the Spaces persistent-storage mount,
    # which is absent unless storage is attached - confirm.
    print(
        f"Warning: cannot open /data/build.log ({_log_error}); "
        "logging to stdout only",
        file=sys.stderr,
    )

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=_log_handlers,
)
logger = logging.getLogger(__name__)
37
-
38
def check_artifacts():
    """Return True when every required retrieval artifact is already on disk.

    The FAISS index file is only required when FAISS_AVAILABLE is truthy.
    All files are looked up directly inside ARTIFACT_DIR.
    """
    expected = [
        "corpus_data.json",
        "corpus_embeddings.npy",
        "answer_embeddings.npy",
        "bm25.pkl",
    ]
    if FAISS_AVAILABLE:
        expected.append("faiss_index.bin")

    # Stop at the first missing file; nothing else needs checking after that.
    for name in expected:
        if not os.path.exists(os.path.join(ARTIFACT_DIR, name)):
            return False
    return True
52
-
53
def build_retrieval_with_progress():
    """Load the datasets, build the retrieval system and log a short summary.

    Returns:
        The object returned by ``build_retrieval_system``.
    """
    logger.info("Building SOTA RAG Retrieval System for Coding Assistant")
    logger.info("Architecture: HyDE + Query Rewriting + Multi-Query + Answer-Space Retrieval")
    logger.info(f"Embedding Model: {EMBED_MODEL}")
    logger.info(f"Max Corpus Size: {MAX_CORPUS_SIZE}")

    # Step 1: fetch the raw datasets.
    logger.info("Loading coding datasets...")
    datasets = load_opc_datasets()

    # Step 2: delegate to the builder imported from app.py.
    logger.info("Building retrieval system...")
    system = build_retrieval_system(datasets)

    # Step 3: report what was built.
    logger.info("Retrieval system built successfully!")
    corpus_size = len(system.corpus_texts)
    logger.info(f" - Corpus size: {corpus_size}")
    embedding_dim = system.corpus_embeddings.shape[1]
    logger.info(f" - Embedding dimension: {embedding_dim}")
    has_faiss = 'Yes' if system.faiss_index else 'No'
    logger.info(f" - FAISS index: {has_faiss}")

    return system
74
-
75
def prepare_llm_artifacts():
    """Write the tokenizer, generation config and a minimal config.json.

    Everything is saved under ``ARTIFACT_DIR/llm_model``. Only the tokenizer
    for MODEL_NAME is fetched here; no model weights are loaded by this
    function.
    """
    logger.info("🤖 Preparing LLM configuration...")

    from transformers import AutoTokenizer, GenerationConfig

    llm_path = os.path.join(ARTIFACT_DIR, "llm_model")
    os.makedirs(llm_path, exist_ok=True)

    # Fetch the tokenizer and make sure it has a padding token.
    logger.info(f"📥 Downloading tokenizer for {MODEL_NAME}...")
    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    # Chat template, kept in sync with the one in app.py.
    template_parts = [
        "{% for message in messages %}",
        "{{'<|'+message['role']+'|>\\n'+message['content']+'</s>\\n'}}",
        "{% endfor %}",
        "{% if add_generation_prompt %}",
        "<|assistant|>\n",
        "{% endif %}",
    ]
    tok.chat_template = "".join(template_parts)

    # Generation settings, kept in sync with the ones in app.py.
    gen_config = GenerationConfig(
        max_new_tokens=300,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.15,
        pad_token_id=tok.pad_token_id,
    )

    # Persist tokenizer first, then the generation config.
    tok.save_pretrained(llm_path)
    gen_config.save_pretrained(llm_path)

    # Quantization settings are only recorded when CUDA is available.
    if torch.cuda.is_available():
        quantization = {
            "load_in_4bit": True,
            "bnb_4bit_compute_dtype": "float32",
            "bnb_4bit_use_double_quant": True,
            "bnb_4bit_quant_type": "nf4",
        }
    else:
        quantization = {}

    minimal_config = {
        "_name_or_path": MODEL_NAME,
        "architectures": ["LlamaForCausalLM"],
        "model_type": "llama",
        "torch_dtype": "float16",
        "quantization_config": quantization,
    }

    config_path = os.path.join(llm_path, "config.json")
    with open(config_path, "w") as handle:
        json.dump(minimal_config, handle, indent=2)

    logger.info(f"LLM configuration saved to {llm_path}")
    logger.info("Note: Full model will be downloaded at runtime with 4-bit quantization")
135
-
136
def verify_artifacts():
    """Log the size of each expected artifact, or a warning if it is missing.

    A missing FAISS index is only reported when FAISS_AVAILABLE is truthy.
    """
    logger.info("🔍 Verifying artifacts...")

    expected = [
        ("corpus_data.json", "Corpus data"),
        ("corpus_embeddings.npy", "Question embeddings"),
        ("answer_embeddings.npy", "Answer embeddings"),
        ("bm25.pkl", "BM25 index"),
        ("faiss_index.bin", "FAISS index"),
    ]

    for filename, label in expected:
        full_path = os.path.join(ARTIFACT_DIR, filename)

        if not os.path.exists(full_path):
            # The FAISS index is optional when FAISS itself is unavailable.
            faiss_not_expected = filename == "faiss_index.bin" and not FAISS_AVAILABLE
            if not faiss_not_expected:
                logger.warning(f" ✗ Missing: {label}")
            continue

        size_mb = os.path.getsize(full_path) / (1024 * 1024)
        logger.info(f" ✓ {label}: {size_mb:.2f} MB")
156
-
157
def main():
    """Run the build: create artifacts if needed, verify them, report size."""
    banner = "=" * 60

    logger.info(banner)
    logger.info("🤖 Codey Bryant 3.0 - SOTA RAG Build Script")
    logger.info(banner)

    # Make sure the output directory exists before anything is written.
    os.makedirs(ARTIFACT_DIR, exist_ok=True)

    # Only rebuild when at least one retrieval artifact is missing.
    if not check_artifacts():
        logger.info("Building fresh artifacts...")

        # Retrieval artifacts first, then the LLM configuration.
        build_retrieval_with_progress()
        prepare_llm_artifacts()

        logger.info("Build complete!")
    else:
        logger.info("Artifacts already exist. Skipping build.")
        logger.info("Delete artifacts to force rebuild.")

    # Report per-file status.
    verify_artifacts()

    # Add up the size of everything below the artifact directory.
    logger.info("\nArtifact Summary:")
    total_size = 0
    for folder, _subdirs, filenames in os.walk(ARTIFACT_DIR):
        for filename in filenames:
            file_path = os.path.join(folder, filename)
            total_size += os.path.getsize(file_path) / (1024 * 1024)

    logger.info(f" Total size: {total_size:.2f} MB")
    logger.info(banner)
    logger.info("Ready to launch Codey Bryant!")
    logger.info(" Run: python app.py")
    logger.info(banner)


if __name__ == "__main__":
    main()