NOT-OMEGA committed on
Commit
29bb2d7
Β·
verified Β·
1 Parent(s): a563c79

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +179 -152
main.py CHANGED
@@ -1,152 +1,179 @@
1
- # main.py - SLM Inference Server
2
- from fastapi import FastAPI, HTTPException
3
- from fastapi.middleware.cors import CORSMiddleware
4
- from pydantic import BaseModel
5
- import subprocess
6
- import tiktoken
7
- import os
8
- import time
9
-
10
- app = FastAPI()
11
-
12
- app.add_middleware(
13
- CORSMiddleware,
14
- allow_origins=["*"],
15
- allow_methods=["*"],
16
- allow_headers=["*"],
17
- )
18
-
19
- class GenerateRequest(BaseModel):
20
- prompt: str
21
- max_tokens: int = 100
22
- temperature: float = 0.8
23
- top_k: int = 40
24
-
25
- # Tokenizer setup
26
- try:
27
- enc = tiktoken.get_encoding("gpt2")
28
- print("βœ… Tokenizer loaded successfully.")
29
- except Exception as e:
30
- print(f"❌ Warning: tiktoken not found. Error: {e}")
31
- enc = None
32
-
33
-
34
- @app.get("/health")
35
- async def health_check():
36
- current_dir = os.path.dirname(os.path.abspath(__file__))
37
- exe_path = os.path.join(current_dir, "inference.exe")
38
- model_path = os.path.join(current_dir, "model.bin")
39
-
40
- return {
41
- "status": "ok",
42
- "inference_exe_found": os.path.exists(exe_path),
43
- "model_bin_found": os.path.exists(model_path),
44
- "working_directory": current_dir
45
- }
46
-
47
-
48
- @app.post("/generate")
49
- async def generate_text(req: GenerateRequest):
50
-
51
- # 0. Tokenizer check
52
- if enc is None:
53
- raise HTTPException(
54
- status_code=500,
55
- detail="Tokenizer not loaded. Run: pip install tiktoken"
56
- )
57
-
58
- # 1. Encode prompt
59
- input_tokens = enc.encode(req.prompt)
60
- token_str = ",".join(map(str, input_tokens))
61
-
62
- # 2. Path setup
63
- current_dir = os.path.dirname(os.path.abspath(__file__))
64
- exe_path = os.path.join(current_dir, "inference.exe")
65
- model_path = os.path.join(current_dir, "model.bin")
66
-
67
- print(f"DEBUG: exe -> {exe_path} exists={os.path.exists(exe_path)}")
68
- print(f"DEBUG: model -> {model_path} exists={os.path.exists(model_path)}")
69
-
70
- # 3. File existence checks
71
- if not os.path.exists(exe_path):
72
- raise HTTPException(
73
- status_code=500,
74
- detail=f"inference.exe nahi mili: {exe_path} β€” Pehle C++ compile karo!"
75
- )
76
-
77
- if not os.path.exists(model_path):
78
- raise HTTPException(
79
- status_code=500,
80
- detail=f"model.bin nahi mili: {model_path} β€” Model file same folder mein rakhni hai!"
81
- )
82
-
83
- # 4. Run C++ engine
84
- # FIX: temperature aur top_k ab subprocess ko pass ho rahe hain
85
- try:
86
- start_time = time.perf_counter()
87
-
88
- process = subprocess.run(
89
- [
90
- exe_path,
91
- token_str,
92
- str(req.max_tokens),
93
- str(req.temperature), # <-- FIX: was missing before
94
- str(req.top_k), # <-- FIX: was missing before
95
- ],
96
- capture_output=True,
97
- text=True,
98
- cwd=current_dir
99
- )
100
-
101
- elapsed_ms = (time.perf_counter() - start_time) * 1000
102
-
103
- except Exception as e:
104
- raise HTTPException(status_code=500, detail=f"Execution failed: {str(e)}")
105
-
106
- # 5. Error check
107
- if process.returncode != 0 and not process.stdout.strip():
108
- stdout_msg = process.stdout.strip() if process.stdout else ""
109
- stderr_msg = process.stderr.strip() if process.stderr else ""
110
-
111
- if "ERROR_MODEL_NOT_FOUND" in stdout_msg:
112
- raise HTTPException(status_code=500, detail="model.bin nahi mili! Same folder mein rakho.")
113
- elif "ERROR_ARGS" in stdout_msg:
114
- raise HTTPException(status_code=500, detail="C++ engine ko arguments galat mile.")
115
- else:
116
- raise HTTPException(
117
- status_code=500,
118
- detail=f"C++ Error | stdout: '{stdout_msg}' | stderr: '{stderr_msg}'"
119
- )
120
-
121
- # 6. Decode output token IDs
122
- try:
123
- output_str = process.stdout.strip()
124
-
125
- if not output_str:
126
- generated_ids = []
127
- else:
128
- generated_ids = []
129
- for x in output_str.split():
130
- try:
131
- generated_ids.append(int(x))
132
- except ValueError:
133
- print(f"DEBUG: skipping non-integer token: '{x}'")
134
-
135
- generated_text = enc.decode(generated_ids) if generated_ids else ""
136
-
137
- tokens_out = len(generated_ids)
138
- tokens_per_sec = round(tokens_out / (elapsed_ms / 1000), 2) if elapsed_ms > 0 else 0
139
-
140
- print(f"βœ… Generated {tokens_out} tokens in {elapsed_ms:.2f}ms ({tokens_per_sec} tok/s)")
141
-
142
- return {
143
- "prompt": req.prompt,
144
- "generated_text": generated_text,
145
- "tokens_in": len(input_tokens),
146
- "tokens_out": tokens_out,
147
- "latency_ms": round(elapsed_ms, 2),
148
- "tokens_per_sec": tokens_per_sec
149
- }
150
-
151
- except Exception as e:
152
- raise HTTPException(status_code=500, detail=f"Decoding error: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # main.py - SLM Inference Server
2
+ from fastapi import FastAPI, HTTPException
3
+ from fastapi.responses import FileResponse
4
+ from fastapi.middleware.cors import CORSMiddleware
5
+ from pydantic import BaseModel
6
+ import subprocess
7
+ import tiktoken
8
+ import os
9
+ import time
10
+
11
app = FastAPI()

# CORS is wide open (any origin, method, header) — acceptable for a demo
# Space; tighten allow_origins before exposing anything sensitive.
_cors_config = dict(
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
app.add_middleware(CORSMiddleware, **_cors_config)
19
+
20
class GenerateRequest(BaseModel):
    """Request payload for POST /generate."""

    # Text to tokenize and feed to the inference engine.
    prompt: str
    # Generation budget forwarded to the C++ binary.
    max_tokens: int = 100
    # Sampling temperature forwarded to the C++ binary.
    temperature: float = 0.8
    # Top-k sampling cutoff forwarded to the C++ binary.
    top_k: int = 40
25
+
26
# Tokenizer setup: keep the module importable even when tiktoken is missing;
# endpoints check `enc is None` and surface a clear install hint instead.
try:
    enc = tiktoken.get_encoding("gpt2")
except Exception as e:
    print(f"❌ Warning: tiktoken not found. Error: {e}")
    enc = None
else:
    print("βœ… Tokenizer loaded successfully.")
33
+
34
+
35
# Serve the frontend page at the root URL.
@app.get("/")
async def root():
    """Return index.html (next to this file) as the landing page.

    Raises:
        HTTPException(404): when index.html is missing from the app folder.
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))
    index_path = os.path.join(current_dir, "index.html")
    # BUG FIX: FileResponse on a missing file errors only while streaming
    # the response (opaque 500 in the logs); fail fast with a clear 404.
    if not os.path.exists(index_path):
        raise HTTPException(status_code=404, detail="index.html not found")
    return FileResponse(index_path)
40
+
41
+
42
+ @app.get("/health")
43
+ async def health_check():
44
+ current_dir = os.path.dirname(os.path.abspath(__file__))
45
+ # βœ… FIX 2: .exe β†’ no extension (Linux binary)
46
+ exe_path = os.path.join(current_dir, "inference")
47
+ model_path = os.path.join(current_dir, "model.bin")
48
+
49
+ return {
50
+ "status": "ok",
51
+ "inference_exe_found": os.path.exists(exe_path),
52
+ "model_bin_found": os.path.exists(model_path),
53
+ "working_directory": current_dir
54
+ }
55
+
56
+
57
+ @app.post("/generate")
58
+ async def generate_text(req: GenerateRequest):
59
+
60
+ # 0. Tokenizer check
61
+ if enc is None:
62
+ raise HTTPException(
63
+ status_code=500,
64
+ detail="Tokenizer not loaded. Run: pip install tiktoken"
65
+ )
66
+
67
+ # 1. Encode prompt
68
+ input_tokens = enc.encode(req.prompt)
69
+ token_str = ",".join(map(str, input_tokens))
70
+
71
+ # 2. Path setup
72
+ current_dir = os.path.dirname(os.path.abspath(__file__))
73
+ # βœ… FIX 3: .exe β†’ no extension (Linux binary)
74
+ exe_path = os.path.join(current_dir, "inference")
75
+ model_path = os.path.join(current_dir, "model.bin")
76
+
77
+ print(f"DEBUG: exe -> {exe_path} exists={os.path.exists(exe_path)}")
78
+ print(f"DEBUG: model -> {model_path} exists={os.path.exists(model_path)}")
79
+
80
+ # 3. File existence checks
81
+ if not os.path.exists(exe_path):
82
+ raise HTTPException(
83
+ status_code=500,
84
+ detail=f"inference binary nahi mili: {exe_path} β€” Dockerfile se compile honi chahiye!"
85
+ )
86
+
87
+ if not os.path.exists(model_path):
88
+ raise HTTPException(
89
+ status_code=500,
90
+ detail=f"model.bin nahi mili: {model_path} β€” Model file same folder mein rakhni hai!"
91
+ )
92
+
93
+ # 4. Run C++ engine
94
+ try:
95
+ start_time = time.perf_counter()
96
+
97
+ process = subprocess.run(
98
+ [
99
+ exe_path,
100
+ token_str,
101
+ str(req.max_tokens),
102
+ str(req.temperature),
103
+ str(req.top_k),
104
+ ],
105
+ capture_output=True,
106
+ text=True,
107
+ cwd=current_dir
108
+ )
109
+
110
+ elapsed_ms = (time.perf_counter() - start_time) * 1000
111
+
112
+ except Exception as e:
113
+ raise HTTPException(status_code=500, detail=f"Execution failed: {str(e)}")
114
+
115
+ # 5. Error check
116
+ if process.returncode != 0 and not process.stdout.strip():
117
+ stdout_msg = process.stdout.strip() if process.stdout else ""
118
+ stderr_msg = process.stderr.strip() if process.stderr else ""
119
+
120
+ if "ERROR_MODEL_NOT_FOUND" in stdout_msg:
121
+ raise HTTPException(status_code=500, detail="model.bin nahi mili! Same folder mein rakho.")
122
+ elif "ERROR_ARGS" in stdout_msg:
123
+ raise HTTPException(status_code=500, detail="C++ engine ko arguments galat mile.")
124
+ else:
125
+ raise HTTPException(
126
+ status_code=500,
127
+ detail=f"C++ Error | stdout: '{stdout_msg}' | stderr: '{stderr_msg}'"
128
+ )
129
+
130
+ # 6. Decode output token IDs
131
+ try:
132
+ output_str = process.stdout.strip()
133
+
134
+ if not output_str:
135
+ generated_ids = []
136
+ else:
137
+ generated_ids = []
138
+ for x in output_str.split():
139
+ try:
140
+ generated_ids.append(int(x))
141
+ except ValueError:
142
+ print(f"DEBUG: skipping non-integer token: '{x}'")
143
+
144
+ generated_text = enc.decode(generated_ids) if generated_ids else ""
145
+
146
+ tokens_out = len(generated_ids)
147
+ tokens_per_sec = round(tokens_out / (elapsed_ms / 1000), 2) if elapsed_ms > 0 else 0
148
+
149
+ print(f"βœ… Generated {tokens_out} tokens in {elapsed_ms:.2f}ms ({tokens_per_sec} tok/s)")
150
+
151
+ return {
152
+ "prompt": req.prompt,
153
+ "generated_text": generated_text,
154
+ "tokens_in": len(input_tokens),
155
+ "tokens_out": tokens_out,
156
+ "latency_ms": round(elapsed_ms, 2),
157
+ "tokens_per_sec": tokens_per_sec
158
+ }
159
+
160
+ except Exception as e:
161
+ raise HTTPException(status_code=500, detail=f"Decoding error: {str(e)}")
162
+ ```
163
+
164
+ ---
165
+
166
+ ## πŸš€ Deployment Steps (Basic se Basic)
167
+
168
+ **Step 1** β€” Hugging Face pe jaao β†’ apna `NOT-OMEGA/Inference` Space kholo
169
+
170
+ **Step 2** β€” `main.py` file pe click karo β†’ Edit button dabao β†’ Pura purana code delete karo β†’ Upar wala naya code paste karo β†’ **Commit changes**
171
+
172
+ **Step 3** β€” Baaki files (Dockerfile, index.html, inference.cpp, requirements.txt) already sahi hain, unhe **mat chhoona**
173
+
174
+ **Step 4** β€” Space automatically rebuild hoga (2-5 minutes lagenge)
175
+
176
+ **Step 5** β€” **Logs** tab check karo β€” ye lines dikhni chahiye:
177
+ ```
178
+ βœ… Tokenizer loaded successfully.
179
+ INFO: Uvicorn running on http://0.0.0.0:7860