KoKoDanio committed
Commit b49a99c · 1 Parent(s): d09a01e

1st commit

Files changed (3)
  1. DockerFile +22 -0
  2. app.py +91 -0
  3. requirements.txt +6 -0
DockerFile ADDED
@@ -0,0 +1,22 @@
+ # Use a base image with Python
+ FROM python:3.10-slim
+
+ # Set the working directory in the container
+ WORKDIR /app
+
+ # Copy the requirements file and install dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Install the correct unsloth build (pinned to the same spec as requirements.txt)
+ # This command is crucial for proper GPU setup
+ RUN pip install "unsloth[cu121-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git@August-2025"
+
+ # Copy the rest of your application code
+ COPY . .
+
+ # Expose the port your application will run on
+ EXPOSE 8000
+
+ # Command to run your application using Uvicorn
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
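To try the image locally, something like the following should work (a sketch: the cyber-llama-api tag is an arbitrary example name, -f DockerFile matches the file name used in this commit, and --gpus all assumes the NVIDIA Container Toolkit is installed):

docker build -f DockerFile -t cyber-llama-api .
docker run --gpus all -p 8000:8000 cyber-llama-api

On Hugging Face Spaces the build and launch happen automatically, so these commands are only needed for local testing.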
app.py ADDED
@@ -0,0 +1,91 @@
+ import torch
+ from fastapi import FastAPI
+ from pydantic import BaseModel, Field
+ from unsloth import FastLanguageModel
+ from transformers import AutoTokenizer, StoppingCriteria, StoppingCriteriaList
+
+ # Initialize FastAPI app
+ app = FastAPI(title="Llama-3.1 Finetuned API", version="1.0.0")
+
+ # --- Model Loading ---
+ try:
+     lora_adapter_path = "cyber_llama"
+     model, tokenizer = FastLanguageModel.from_pretrained(
+         model_name=lora_adapter_path,
+         max_seq_length=2048,
+         load_in_4bit=True,
+     )
+     FastLanguageModel.for_inference(model)
+ except Exception as e:
+     print(f"Error loading model: {e}")
+     # Set to None to handle errors gracefully in the API endpoint
+     model = None
+     tokenizer = None
+
+ # Pydantic model for the request body
+ class PromptRequest(BaseModel):
+     prompt: str = Field(..., description="The user's prompt or instruction for the model.")
+     max_new_tokens: int = Field(512, ge=1, description="Maximum number of tokens to generate.")
+     stop_sequences: list[str] = Field([".", "!", "?"], description="A list of strings that will stop the generation.")
+
+ # A custom stopping criterion that halts generation on the requested stop tokens
+ class StopOnTokens(StoppingCriteria):
+     def __init__(self, stop_token_ids):
+         super().__init__()
+         self.stop_token_ids = stop_token_ids
+
+     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+         return any(input_ids[0][-1] == token_id for token_id in self.stop_token_ids)
+
+ # API endpoint for text generation
+ @app.post("/generate", summary="Generates text based on a given prompt")
+ async def generate(request: PromptRequest):
+     if not model or not tokenizer:
+         return {"error": "Model not loaded. Please check the server logs."}
+
+     # The prompt template for the model
+     alpaca_prompt = """You are a trustworthy cybersecurity and privacy assistant that provides clear, safe, and practical guidance on protecting data, avoiding threats, and staying secure online.
+
+ ### Instruction:
+ Analyse the user input and answer the question carefully. Please try to obey the cybersecurity and privacy laws.
+
+ ### Input:
+ {}
+
+ ### Response:
+ {}"""
+
+     inputs = tokenizer(
+         [
+             alpaca_prompt.format(
+                 request.prompt,  # input from the user
+                 "",  # empty response to be filled by the model
+             )
+         ],
+         return_tensors="pt"
+     ).to("cuda")
+
+     # Convert the stop sequences to token IDs (this assumes each stop string is a single token in the vocabulary)
+     stop_token_ids = tokenizer.convert_tokens_to_ids(request.stop_sequences)
+
+     # Create the stopping criteria list
+     stopping_criteria = StoppingCriteriaList([StopOnTokens(stop_token_ids)])
+
+     outputs = model.generate(
+         **inputs,
+         max_new_tokens=request.max_new_tokens,
+         use_cache=True,
+         do_sample=True,  # sampling gives more varied responses
+         stopping_criteria=stopping_criteria
+     )
+
+     # Decode only the newly generated tokens, skipping the prompt
+     generated_text = tokenizer.batch_decode(outputs[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True)[0]
+
+     return {"generated_text": generated_text}
+
+ # This section is for local testing and will not be run on Hugging Face Spaces
+ if __name__ == "__main__":
+     import uvicorn
+     # Make sure to include the ngrok setup for local testing on Colab
+     uvicorn.run(app, host="0.0.0.0", port=8000)
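Once the container is running, the /generate endpoint can be exercised with a plain HTTP request (a minimal sketch; the prompt text, localhost address, and token budget are example values):

curl -X POST http://localhost:8000/generate \
  -H "Content-Type: application/json" \
  -d '{"prompt": "How can I spot a phishing email?", "max_new_tokens": 256}'

A successful call returns a JSON object with a generated_text field; if the model failed to load at startup, the endpoint returns an error key instead.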
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ torch
+ transformers
+ unsloth[cu121-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git@August-2025
+ streamlit
+ fastapi
+ uvicorn