Veeru-c committed
Commit f512f65 · 1 Parent(s): 5aadfb4

initial commit

docs/api_endpoint.py ADDED
@@ -0,0 +1,60 @@
+ import modal
+
+ app = modal.App("census-qa-api")
+ vol_checkpoints = modal.Volume.from_name("model-checkpoints")
+
+ image = modal.Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10") \
+     .apt_install("git") \
+     .run_commands(
+         "pip install --upgrade pip",
+         "pip install --upgrade pip packaging ninja psutil unsloth_zoo torchvision fastapi",
+         "pip install xformers trl peft accelerate bitsandbytes scipy huggingface_hub protobuf sentencepiece einops",
+         "pip install --no-deps 'unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git'"
+     ) \
+     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
+
+ @app.cls(image=image, volumes={"/data/checkpoints": vol_checkpoints}, gpu="A10G", keep_warm=1)
+ class Model:
+     @modal.enter()
+     def load(self):
+         from unsloth import FastLanguageModel
+         print("Loading model...")
+         self.model, self.tokenizer = FastLanguageModel.from_pretrained(
+             "/data/checkpoints/phi3-census-lora",
+             max_seq_length=2048,
+             dtype=None,
+             load_in_4bit=True,
+         )
+         FastLanguageModel.for_inference(self.model)
+         print("Model loaded!")
+
+     @modal.web_endpoint(method="POST")
+     def ask(self, data: dict):
+         try:
+             prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+ ### Instruction:
+ {data.get('question', '')}
+
+ ### Input:
+ {data.get('context', 'Context: Japan Census data.')}
+
+ ### Response:
+ """
+             inputs = self.tokenizer([prompt], return_tensors="pt").to("cuda")
+             outputs = self.model.generate(**inputs, max_new_tokens=150, temperature=0.1, use_cache=True)
+             response = self.tokenizer.batch_decode(outputs)[0]
+
+             if "### Response:\n" in response:
+                 answer = response.split("### Response:\n")[1].split("<|endoftext|>")[0].strip()
+             else:
+                 answer = response.strip()
+
+             return {"question": data.get('question'), "answer": answer}
+         except Exception as e:
+             print(f"Error: {str(e)}")
+             return {"error": str(e)}
+
+ @app.local_entrypoint()
+ def main():
+     print("To deploy: modal deploy docs/api_endpoint.py")
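Once deployed, the `ask` endpoint accepts a POST with a JSON body containing a `question` key and an optional `context` key, exactly as read via `data.get(...)` above; the same payload also works for the CPU variant below. A minimal client sketch, where the URL is a placeholder for whatever `modal deploy` prints for your workspace and the question text is invented for illustration:

import requests

# Placeholder: substitute the web endpoint URL printed by `modal deploy`.
ENDPOINT_URL = "https://<your-workspace>--census-qa-api-model-ask.modal.run"

payload = {
    "question": "What is the total population for Tokyo?",  # hypothetical example question
    "context": "Context: Japan Census data.",               # optional; this is the default used by the endpoint
}

resp = requests.post(ENDPOINT_URL, json=payload, timeout=120)
print(resp.json())  # expected shape: {"question": ..., "answer": ...} or {"error": ...} on failure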
docs/api_endpoint_cpu.py ADDED
@@ -0,0 +1,83 @@
+ import modal
+
+ app = modal.App("census-qa-api-cpu")
+ vol_checkpoints = modal.Volume.from_name("model-checkpoints")
+
+ # CPU-only image (no CUDA)
+ image = modal.Image.debian_slim(python_version="3.10") \
+     .pip_install(
+         "torch",
+         "transformers",
+         "peft",
+         "accelerate",
+         "bitsandbytes",
+         "scipy",
+         "huggingface_hub",
+         "protobuf",
+         "sentencepiece",
+         "fastapi"
+     )
+
+ @app.cls(
+     image=image,
+     volumes={"/data/checkpoints": vol_checkpoints},
+     cpu=4,        # Use CPU instead of GPU
+     memory=8192,  # 8GB RAM
+     keep_warm=1
+ )
+ class ModelCPU:
+     @modal.enter()
+     def load(self):
+         from transformers import AutoModelForCausalLM, AutoTokenizer
+         from peft import PeftModel
+
+         print("Loading model on CPU...")
+
+         # Load base model
+         base_model = "microsoft/Phi-3-mini-4k-instruct"
+         self.tokenizer = AutoTokenizer.from_pretrained(base_model)
+
+         # Load with PEFT adapter (no quantization on CPU)
+         model = AutoModelForCausalLM.from_pretrained(
+             base_model,
+             torch_dtype="auto",
+             device_map="cpu"
+         )
+
+         # Load LoRA adapter
+         self.model = PeftModel.from_pretrained(
+             model,
+             "/data/checkpoints/phi3-census-lora"
+         )
+
+         print("Model loaded on CPU!")
+
+     @modal.web_endpoint(method="POST")
+     def ask(self, data: dict):
+         prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+ ### Instruction:
+ {data.get('question', '')}
+
+ ### Input:
+ {data.get('context', 'Context: Japan Census data.')}
+
+ ### Response:
+ """
+
+         inputs = self.tokenizer([prompt], return_tensors="pt")
+         outputs = self.model.generate(**inputs, max_new_tokens=150, temperature=0.1)
+         response = self.tokenizer.batch_decode(outputs)[0]
+
+         if "### Response:\n" in response:
+             answer = response.split("### Response:\n")[1].split("<|endoftext|>")[0].strip()
+         else:
+             answer = response.strip()
+
+         return {"question": data.get('question'), "answer": answer}
+
+ @app.local_entrypoint()
+ def main():
+     print("CPU-based API endpoint")
+     print("Deploy with: modal deploy docs/api_endpoint_cpu.py")
+     print("Note: CPU inference is 10-20x slower than GPU")
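Since the CPU path applies the LoRA adapter through PeftModel on every forward pass, one optional tweak (an assumption on my part, not something this file does) is to merge the adapter weights into the base model once at load time, which removes the per-step adapter overhead. A minimal standalone sketch, assuming the same base model and checkpoint path as above:

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype="auto", device_map="cpu")

# Apply the LoRA adapter, then fold its weights into the base model so that
# generation runs on a plain transformers model rather than through PEFT hooks.
model = PeftModel.from_pretrained(model, "/data/checkpoints/phi3-census-lora")
model = model.merge_and_unload()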
docs/prepare_economy_data.py ADDED
@@ -0,0 +1,200 @@
+ import modal
+ import os
+ import random
+
+ app = modal.App("prepare-economy-data")
+
+ vol_economy = modal.Volume.from_name("economy-labor-data")
+ vol_dataset = modal.Volume.from_name("finetune-dataset", create_if_missing=True)
+
+ image = modal.Image.debian_slim().pip_install("pandas", "openpyxl")
+
+ @app.function(image=image, volumes={"/data/economy": vol_economy})
+ def list_csv_files() -> list:
+     """List only economy/labor CSV files"""
+     files = []
+     for root, _, filenames in os.walk("/data/economy"):
+         for f in filenames:
+             if f.lower().endswith('.csv'):
+                 files.append({"path": os.path.join(root, f), "source": "Japan Economy & Labor"})
+     return files
+
+ @app.function(
+     image=image,
+     volumes={"/data/economy": vol_economy},
+     timeout=1200,      # 20 minutes per file
+     max_containers=50  # Reduce parallelism to avoid timeouts
+ )
+ def process_file(file_info: dict) -> dict:
+     import pandas as pd
+     import re
+
+     file_path = file_info["path"]
+     source_name = file_info["source"]
+     data_points, column_info = [], None  # column_info stays None if the file is skipped or errors out
+
+     def clean_value(val):
+         if pd.isna(val):
+             return None
+         val_str = str(val).strip()
+         val_str = re.sub(r'^\d+_', '', val_str)  # Remove codes
+         val_str = re.sub(r'^np\.(int|float)\d*\((.+)\)$', r'\2', val_str)  # Remove numpy wrappers
+         return val_str if val_str and val_str.lower() not in ['nan', 'none'] else None
+
+     try:
+         filename = os.path.basename(file_path)
+         filename_no_ext = os.path.splitext(filename)[0]
+         parts = filename_no_ext.split('_', 1)
+         title = parts[1].replace('_', ' ') if len(parts) > 1 else filename_no_ext
+
+         # Read CSV
+         try:
+             df = pd.read_csv(file_path, low_memory=False)
+         except:
+             return {"data": [], "columns": None}
+
+         if df.empty or len(df) < 3:
+             return {"data": [], "columns": None}
+
+         # Find data start row (adaptive parsing)
+         data_start_row = 0
+         for i in range(min(20, len(df))):
+             row = df.iloc[i]
+             non_null_count = row.count()
+             if non_null_count >= len(df.columns) * 0.3:
+                 string_count = sum(1 for v in row if isinstance(v, str) and len(str(v)) > 0)
+                 if string_count >= non_null_count * 0.5:
+                     data_start_row = i
+                     break
+
+         if data_start_row > 0:
+             new_headers = df.iloc[data_start_row].tolist()
+             df = df.iloc[data_start_row+1:].reset_index(drop=True)
+             df.columns = [clean_value(h) or f"Col_{i}" for i, h in enumerate(new_headers)]
+         else:
+             df.columns = [clean_value(col) or f"Col_{i}" for i, col in enumerate(df.columns)]
+
+         # Filter valid columns
+         valid_cols = [col for col in df.columns if col and not col.startswith("Col_")]
+
+         if len(valid_cols) < 2:
+             return {"data": [], "columns": None}
+
+         df = df[valid_cols]
+         df = df.dropna(how='all')
+
+         if len(df) == 0:
+             return {"data": [], "columns": None}
+
+         column_info = {
+             "file": filename,
+             "columns": list(valid_cols),
+             "row_count": len(df)
+         }
+
+         # Sample ALL rows (no limit) for maximum data
+         df_sample = df
+
+         label_col = df.columns[0]
+         value_cols = df.columns[1:]
+
+         for _, row in df_sample.iterrows():
+             row_label = clean_value(row[label_col])
+             if not row_label:
+                 continue
+
+             # Try to find a valid value column
+             for _ in range(min(5, len(value_cols))):
+                 col = random.choice(value_cols)
+                 val = clean_value(row[col])
+
+                 if val:
+                     question = f"What is the {col} for {row_label}?"
+                     answer = f"The {col} for {row_label} is {val}."
+
+                     entry = {
+                         "instruction": question,
+                         "input": f"Context: {source_name} data from '{title}'.",
+                         "output": answer
+                     }
+                     data_points.append(entry)
+                     break
+     except Exception as e:
+         print(f"Error processing {file_path}: {str(e)}")
+
+     return {"data": data_points, "columns": column_info}
+
+ @app.local_entrypoint()
+ def main():
+     import json
+
+     print("Listing economy/labor files...")
+     files = list_csv_files.remote()
+     print(f"Found {len(files)} economy/labor files. Starting processing...")
+
+     batch_size = 500  # Smaller batches
+     total_train = 0
+     total_val = 0
+     all_columns = []
+
+     for batch_start in range(0, len(files), batch_size):
+         batch_end = min(batch_start + batch_size, len(files))
+         batch_files = files[batch_start:batch_end]
+
+         print(f"Processing batch {batch_start//batch_size + 1}/{(len(files)-1)//batch_size + 1} ({len(batch_files)} files)...")
+
+         batch_data = []
+         for result in process_file.map(batch_files):
+             batch_data.extend(result["data"])
+             if result["columns"]:
+                 all_columns.append(result["columns"])
+
+         print(f"Batch generated {len(batch_data)} data points")
+
+         if not batch_data:
+             continue
+
+         random.shuffle(batch_data)
+         split_idx = int(len(batch_data) * 0.9)
+         train_batch = batch_data[:split_idx]
+         val_batch = batch_data[split_idx:]
+
+         save_batch.remote(train_batch, val_batch, batch_start == 0)
+
+         total_train += len(train_batch)
+         total_val += len(val_batch)
+
+         print(f"Saved {len(train_batch)} train, {len(val_batch)} val. Total: {total_train} train, {total_val} val")
+
+     print("Saving column documentation...")
+     save_column_docs.remote(all_columns)
+
+     print(f"✅ Done! Total: {total_train} train, {total_val} val")
+
+ @app.function(image=image, volumes={"/data/dataset": vol_dataset}, timeout=600)
+ def save_batch(train_data, val_data, is_first_batch):
+     import json
+     mode = 'w' if is_first_batch else 'a'
+
+     with open("/data/dataset/train.jsonl", mode, encoding='utf-8') as f:
+         for entry in train_data:
+             json.dump(entry, f, ensure_ascii=False)
+             f.write('\n')
+
+     with open("/data/dataset/val.jsonl", mode, encoding='utf-8') as f:
+         for entry in val_data:
+             json.dump(entry, f, ensure_ascii=False)
+             f.write('\n')
+
+     vol_dataset.commit()
+
+ @app.function(image=image, volumes={"/data/dataset": vol_dataset}, timeout=600)
+ def save_column_docs(all_columns):
+     with open("/data/dataset/07-dataset-columns.md", "w", encoding="utf-8") as f:
+         f.write("# Economy/Labor Dataset Column Documentation\n\n")
+         f.write(f"Total Files Processed: {len(all_columns)}\n\n")
+         for col_info in all_columns:
+             f.write(f"## {col_info['file']}\n")
+             f.write(f"- **Rows**: {col_info['row_count']}\n")
+             f.write(f"- **Columns**: {', '.join(map(str, col_info['columns']))}\n\n")
+     vol_dataset.commit()