vinay0123 committed
Commit cd9d203 · verified · 1 Parent(s): 5154835

Update app.py

Files changed (1)
  1. app.py +25 -45
app.py CHANGED
@@ -13,9 +13,6 @@ import json
 torch.set_num_threads(os.cpu_count())
 torch.set_num_interop_threads(os.cpu_count())
 
-# Enable optimizations
-torch.backends.mkldnn.enabled = True if hasattr(torch.backends, 'mkldnn') else False
-
 url = "https://drive.google.com/uc?id=1RCZShB5ohy1HdU-mogcP16TbeVv9txpY"
 df = pd.read_csv(url)
 
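Note: the dropped mkldnn toggle was effectively a no-op; CPU builds of PyTorch enable the oneDNN (mkldnn) backend by default whenever it is compiled in. A minimal check to confirm this on the target machine (a sketch, not part of the commit):

    import torch

    # oneDNN/mkldnn is enabled by default when available, so the removed
    # `torch.backends.mkldnn.enabled = True` assignment changed nothing
    if hasattr(torch.backends, "mkldnn"):
        print("mkldnn available:", torch.backends.mkldnn.is_available())
        print("mkldnn enabled:", torch.backends.mkldnn.enabled)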
@@ -76,9 +73,6 @@ def load_model(model, path="gpt_model.pth"):
     if os.path.exists(path):
         model.load_state_dict(torch.load(path, map_location=device, weights_only=True))
         model.eval()
-        # Enable inference optimizations
-        if hasattr(torch.jit, 'optimize_for_inference'):
-            model = torch.jit.optimize_for_inference(torch.jit.script(model))
         print("Model loaded successfully.")
     else:
         print("Model file not found!")
@@ -92,42 +86,30 @@ def generate_response_stream(model, query, max_length=200):
     src = torch.tensor(src_tokens).unsqueeze(0).to(device)
     tgt = torch.tensor([[1]], dtype=torch.long).to(device) # < SOS >
 
-    # Pre-allocate tensor for better memory efficiency
-    max_tgt_len = min(max_length, 200)
-
     with torch.no_grad():
-        # Use torch.inference_mode for better performance
-        with torch.inference_mode():
-            for step in range(max_length):
-                # Forward pass
-                output = model(src, tgt)
-
-                # Get next token more efficiently
-                logits = output[:, -1, :]
-                next_token = torch.argmax(logits, dim=-1, keepdim=True)
-
-                # Check for EOS early
-                if next_token.item() == 2: # <EOS>
-                    break
-
-                # Concatenate token
-                tgt = torch.cat([tgt, next_token], dim=1)
-
-                # Get the current word
-                current_word = tokenizer.idx2word.get(next_token.item(), "<UNK>")
-                if current_word not in ["<PAD>", "<EOS>", "< SOS >"]:
-                    yield current_word + " "
-
-                # Prevent infinite loops
-                if tgt.size(1) >= max_tgt_len:
-                    break
-
-# Flask App with threading optimizations
+        for step in range(max_length):
+            # Forward pass
+            output = model(src, tgt)
+
+            # Get next token more efficiently
+            logits = output[:, -1, :]
+            next_token = torch.argmax(logits, dim=-1, keepdim=True)
+
+            # Check for EOS early
+            if next_token.item() == 2: # <EOS>
+                break
+
+            # Concatenate token
+            tgt = torch.cat([tgt, next_token], dim=1)
+
+            # Get the current word
+            current_word = tokenizer.idx2word.get(next_token.item(), "<UNK>")
+            if current_word not in ["<PAD>", "<EOS>", "< SOS >"]:
+                yield current_word + " "
+
+# Flask App
 app = Flask(__name__)
 
-# Configure Flask for better performance
-app.config['THREADED'] = True
-
 @app.route("/")
 def home():
     return {"message": "Streaming Transformer-based Response Generator API is running!"}
@@ -160,20 +142,18 @@ def query_model():
         mimetype='text/event-stream',
         headers={
             'Cache-Control': 'no-cache',
-            'Connection': 'keep-alive',
-            'X-Accel-Buffering': 'no' # Disable nginx buffering if present
+            'Connection': 'keep-alive'
         }
     )
 
 if __name__ == "__main__":
-    # Load and optimize model
+    # Load model
     model = load_model(model)
 
-    # Run Flask with threading enabled and optimized worker settings
+    # Run Flask with optimizations
     app.run(
         host="0.0.0.0",
         port=7860,
         threaded=True,
-        processes=1, # Use threading instead of multiprocessing for better memory sharing
-        debug=False # Disable debug mode for better performance
+        debug=False
     )
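Note: the endpoint streams plain-text chunks with an SSE-style `text/event-stream` mimetype over a keep-alive connection. A minimal consumer (a sketch: the `/query` route path and the `query` JSON field are assumptions, since the `query_model` decorator and request parsing sit outside this diff):

    import requests

    # Hypothetical endpoint and payload shape; adjust to the actual route
    with requests.post("http://localhost:7860/query",
                       json={"query": "hello"}, stream=True) as resp:
        for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
            print(chunk, end="", flush=True)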