eesfeg commited on
Commit
1e639fb
·
1 Parent(s): f9cb048

Add application file

Browse files
api_fastapi.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # api_fastapi.py
2
+ from fastapi import FastAPI, HTTPException
3
+ from pydantic import BaseModel
4
+ from transformers import AutoModelForCausalLM, AutoTokenizer
5
+ import torch
6
+ import uvicorn
7
+
8
+ app = FastAPI(title="Mistral API")
9
+
10
class ChatRequest(BaseModel):
    """Request payload for the /chat and /batch_chat endpoints."""
    prompt: str               # user prompt; wrapped in [INST]...[/INST] by the handler
    max_tokens: int = 500     # cap on newly generated tokens (max_new_tokens)
    temperature: float = 0.7  # sampling temperature; handler always sets do_sample=True
14
+
15
# Global model instance — populated once at startup, None until loading succeeds
MODEL = None
TOKENIZER = None

@app.on_event("startup")
async def load_model():
    """Load the tokenizer and 8-bit quantized model at application startup.

    On failure the error is only printed: MODEL stays None and /chat keeps
    returning 503 until the process is restarted.
    """
    global MODEL, TOKENIZER
    try:
        TOKENIZER = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
        MODEL = AutoModelForCausalLM.from_pretrained(
            "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            torch_dtype=torch.float16,
            device_map="auto",
            # NOTE(review): the bare load_in_8bit flag is deprecated in newer
            # transformers in favor of BitsAndBytesConfig — confirm pinned version
            load_in_8bit=True
        )
        print("Model loaded successfully!")
    except Exception as e:
        print(f"Error loading model: {e}")
33
+
34
@app.get("/health")
async def health():
    """Liveness probe: reports service status and whether the model is loaded."""
    model_ready = MODEL is not None
    return {"status": "healthy", "model_loaded": model_ready}
37
+
38
@app.post("/chat")
async def chat_completion(request: ChatRequest):
    """Generate one chat completion for `request.prompt`.

    Returns the decoded text after the final [/INST] marker and the count of
    newly generated tokens. Raises 503 if the model is not loaded yet and 500
    on any generation failure.
    """
    if MODEL is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    try:
        # Format prompt using the Llama/Mistral instruct convention
        formatted_prompt = f"[INST] {request.prompt} [/INST]"

        # Tokenize and move tensors to wherever the model was placed
        inputs = TOKENIZER(formatted_prompt, return_tensors="pt").to(MODEL.device)

        # Generate. NOTE(review): this is a blocking call inside an async
        # handler, so it stalls the event loop for the whole generation —
        # consider run_in_executor for production use.
        with torch.no_grad():
            outputs = MODEL.generate(
                **inputs,
                max_new_tokens=request.max_tokens,
                temperature=request.temperature,  # must be > 0 since do_sample=True
                do_sample=True,
                top_p=0.95
            )

        # Decode the full sequence, then keep only the text after the last [/INST]
        response = TOKENIZER.decode(outputs[0], skip_special_tokens=True)
        response = response.split("[/INST]")[-1].strip()

        return {
            "response": response,
            # new tokens = total output length minus prompt length
            "tokens_generated": len(outputs[0]) - len(inputs.input_ids[0])
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
71
+
72
@app.post("/batch_chat")
async def batch_chat(requests: list[ChatRequest]):
    """Run the /chat handler sequentially over a list of requests."""
    responses = [await chat_completion(req) for req in requests]
    return {"responses": responses}
80
+
81
if __name__ == "__main__":
    # Bind on all interfaces, port 8000 (matches EXPOSE 8000 in the Dockerfile)
    uvicorn.run(app, host="0.0.0.0", port=8000)
app.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM
4
+ from peft import PeftModel
5
+
6
BASE_MODEL = "abdelac/tinyllama"       # base checkpoint on the HF Hub
LORA_MODEL = "abdelac/tinyllama-lora"  # LoRA adapter weights trained by train.py

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Load base model (4-bit for low RAM)
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    load_in_4bit=True,  # NOTE(review): bare flag deprecated in newer transformers; prefer BitsAndBytesConfig
    device_map="auto",
    torch_dtype=torch.float16
)

# Load LoRA adapters on top of the quantized base model
model = PeftModel.from_pretrained(base_model, LORA_MODEL)
model.eval()  # inference only — disables dropout
23
+
24
def chat(prompt, max_tokens=200, temperature=0.7):
    """Generate a completion for `prompt`.

    Returns the decoded text of the first sequence (prompt text included,
    special tokens stripped).
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    sampling_options = dict(
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=0.9,
    )
    with torch.no_grad():
        generated = model.generate(**encoded, **sampling_options)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
37
+
38
# Gradio UI — the three inputs map positionally onto chat(prompt, max_tokens, temperature)
demo = gr.Interface(
    fn=chat,
    inputs=[
        gr.Textbox(lines=4, label="Prompt"),
        gr.Slider(50, 500, value=200, label="Max tokens"),
        gr.Slider(0.1, 1.5, value=0.7, label="Temperature")
    ],
    outputs="text",
    title="TinyLlama Fine-Tuned (LoRA)",
    description="TinyLlama loaded with LoRA adapters for domain-specific inference"
)

demo.launch()
app_gradio.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app_gradio.py
2
+ import gradio as gr
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer
4
+ import torch
5
+
6
class MistralApp:
    """Lazy-loading chat wrapper around the TinyLlama chat model for the Gradio UI."""

    def __init__(self):
        # Preferred device for input tensors; the model itself is placed by
        # device_map="auto" during load_model().
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = None      # loaded on demand via load_model()
        self.tokenizer = None

    def load_model(self):
        """Load tokenizer + 8-bit model once; returns a status string for the UI."""
        if self.model is None:
            print("Loading model...")
            self.tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
            self.model = AutoModelForCausalLM.from_pretrained(
                "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                torch_dtype=torch.float16,
                device_map="auto",
                load_in_8bit=True  # Reduce memory usage
            )
            print("Model loaded!")
        return "Model loaded successfully!"

    def respond(self, message, history):
        """Generate a reply to `message` given `history` as [(user, assistant), ...]."""
        # Flatten the chat history into the [INST] ... [/INST] convention
        formatted_prompt = self.format_chat(message, history)

        # NOTE(review): inputs go to self.device while the model was dispatched
        # by device_map="auto" — fine single-GPU, confirm for multi-device setups.
        inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True
            )

        # Keep only the text after the final [/INST] marker (the new reply)
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response.split("[/INST]")[-1].strip()

    def format_chat(self, message, history):
        """Serialize past (user, assistant) turns plus the new message into one prompt."""
        prompt = ""
        for user_msg, assistant_msg in history:
            prompt += f"[INST] {user_msg} [/INST] {assistant_msg} "
        prompt += f"[INST] {message} [/INST]"
        return prompt
48
+
49
# Create Gradio interface
app = MistralApp()

with gr.Blocks(title="Mistral Chat Assistant") as demo:
    gr.Markdown("# 🤖 Mistral 7B Chat Assistant")

    with gr.Row():
        with gr.Column(scale=1):
            load_btn = gr.Button("Load Model", variant="primary")
            status = gr.Textbox(label="Status", interactive=False)

        with gr.Column(scale=4):
            chatbot = gr.Chatbot(height=500)
            msg = gr.Textbox(label="Your message", placeholder="Type your message here...")
            send_btn = gr.Button("Send", variant="primary")
            clear_btn = gr.Button("Clear")

    # Connect events
    load_btn.click(app.load_model, outputs=status)

    def user(user_message, history):
        # Stage the user's turn (assistant slot None) and clear the textbox.
        return "", history + [[user_message, None]]

    def bot(history):
        # Fill in the assistant slot of the last staged turn.
        response = app.respond(history[-1][0], history[:-1])
        history[-1][1] = response
        return history

    # Enter key and Send button behave identically: stage the turn, then generate.
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    send_btn.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    clear_btn.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
basic_inference.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# basic_inference.py
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load from local directory or Hugging Face
model_path = "./tinyllama"  # or "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto" if device == "cuda" else None,
)

# Move to device ONLY when accelerate's device_map did not place the model.
# FIX: the original called model.to(device) exactly when device_map="auto"
# HAD been used (contradicting its own comment) — redundant at best, and an
# error for models with offloaded modules.
if device == "cuda" and getattr(model, "hf_device_map", None) is None:
    model = model.to(device)
data.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "instruction": "Explain fake news",
4
+ "output": "Fake news is false or misleading information presented as news."
5
+ },
6
+ {
7
+ "instruction": "What is AI?",
8
+ "output": "Artificial Intelligence is the simulation of human intelligence by machines."
9
+ }
10
+ ]
deepseek_dockerfile_20251226_58f521.dockerfile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dockerfile
2
+ FROM python:3.9-slim
3
+
4
+ WORKDIR /app
5
+
6
+ # Install dependencies
7
+ COPY requirements.txt .
8
+ RUN pip install --no-cache-dir -r requirements.txt
9
+
10
+ # Copy application
11
+ COPY . .
12
+
13
+ # Expose port
14
+ EXPOSE 8000
15
+
16
+ # Run the application
17
+ CMD ["python", "api_fastapi.py"]
dockerfile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dockerfile
2
+ FROM python:3.9-slim
3
+
4
+ WORKDIR /app
5
+
6
+ # Install dependencies
7
+ COPY requirements.txt .
8
+ RUN pip install --no-cache-dir -r requirements.txt
9
+
10
+ # Copy application
11
+ COPY . .
12
+
13
+ # Expose port
14
+ EXPOSE 8000
15
+
16
+ # Run the application
17
+ CMD ["python", "api_fastapi.py"]
dockerfile.dockerfile ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # requirements.txt
2
+ torch==2.1.0
3
+ transformers==4.35.0
4
+ accelerate==0.24.1
5
+ fastapi==0.104.1
6
+ uvicorn[standard]==0.24.0
7
+ gradio==4.8.0
8
+ sentencepiece==0.1.99
9
+ bitsandbytes==0.41.1
inference.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # inference.py
2
+ import torch
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
4
+
5
class MistralChat:
    """Chat wrapper: loads an instruct model once, then generates or streams replies."""

    def __init__(self, model_path="TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        print("Loading model...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
            device_map="auto" if self.device == "cuda" else None,
            trust_remote_code=True  # NOTE(review): executes repo code — trusted repos only
        )

        # FIX: the original unconditionally called .to(self.device) on CUDA even
        # though device_map="auto" had already placed the model — redundant, and
        # an error for offloaded modules. Only move when accelerate did not dispatch.
        if self.device == "cuda" and getattr(self.model, "hf_device_map", None) is None:
            self.model = self.model.to(self.device)

        print("Model loaded successfully!")

    def generate(self, prompt, max_length=500, temperature=0.7):
        """Return the assistant's reply to `prompt`.

        Args:
            prompt: User message (wrapped in [INST] tags internally).
            max_length: Maximum number of NEW tokens to generate.
            temperature: Sampling temperature (must be > 0; do_sample is on).
        """
        # Format for instruct models
        formatted_prompt = f"[INST] {prompt} [/INST]"

        inputs = self.tokenizer(formatted_prompt, return_tensors="pt")
        if self.device == "cuda":
            inputs = inputs.to(self.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_length,
                temperature=temperature,
                do_sample=True,
                top_p=0.95,
                pad_token_id=self.tokenizer.eos_token_id  # silence missing-pad warning
            )

        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Extract only the assistant's response (text after the first [/INST])
        if "[/INST]" in response:
            response = response.split("[/INST]")[1].strip()

        return response

    def chat_stream(self, prompt):
        """Stream the response token by token (prints via TextStreamer, returns nothing)."""
        formatted_prompt = f"[INST] {prompt} [/INST]"
        inputs = self.tokenizer(formatted_prompt, return_tensors="pt")

        streamer = TextStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)

        if self.device == "cuda":
            inputs = inputs.to(self.device)

        _ = self.model.generate(**inputs, streamer=streamer, max_new_tokens=500)
59
+
60
# Usage example — loads the model once, then demonstrates both generation modes.
if __name__ == "__main__":
    chat = MistralChat()

    # Single response (returned as a string)
    response = chat.generate("Explain quantum computing in simple terms")
    print("Response:", response)

    # Streaming response (tokens are printed as they are produced)
    print("\nStreaming response:")
    chat.chat_stream("Write a short poem about AI")
optimization.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# optimization.py — three independent serving-optimization examples.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# 1. Use pipeline for simplicity
pipe = pipeline(
    "text-generation",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    model_kwargs={
        "torch_dtype": torch.float16,
        "device_map": "auto",
        "load_in_4bit": True
    },
    tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)

# 2. Use vLLM for high-throughput (install: pip install vllm)
from vllm import LLM, SamplingParams

# FIX: the model id had a stray leading "m" ("mTinyLlama/...") and would 404 on the Hub.
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
sampling_params = SamplingParams(temperature=0.7, max_tokens=500)
outputs = llm.generate(["Hello, how are you?"], sampling_params)

# 3. Cache model responses
import hashlib
from functools import lru_cache

@lru_cache(maxsize=1000)
def cached_generation(prompt, max_tokens=500):
    # NOTE(review): generation samples (temperature/top_p), so the first random
    # response is frozen per (prompt, max_tokens) — acceptable for demos only.
    return pipe(prompt, max_new_tokens=max_tokens)[0]['generated_text']
optimized_loading.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# optimized_loading.py — two ALTERNATIVE low-memory loading recipes.
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from accelerate import infer_auto_device_map  # NOTE(review): imported but never used below

# 4-bit quantization (reduces memory by 75%)
model = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    load_in_4bit=True,  # or load_in_8bit=True for 8-bit
    device_map="auto",
    torch_dtype=torch.float16,
)

# CPU offloading (for low RAM)
# NOTE(review): this second load rebinds `model`, discarding the 4-bit model
# above — the two snippets are alternatives, not meant to run back to back.
model = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    device_map="auto",
    offload_folder="offload",      # spill weights that don't fit to disk here
    offload_state_dict=True,
)
prepare_model.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # prepare_model.py
2
+ import os
3
+ import json
4
+ import shutil
5
+ from pathlib import Path
6
+
7
def create_minimal_model_structure(model_path="."):
    """
    Create minimal required files for a Hugging Face model upload.

    Ensures config.json, tokenizer_config.json, special_tokens_map.json and
    README.md exist in `model_path`, writing Mistral-style defaults for any
    that are missing. Existing files are never overwritten; missing weight
    files only produce a warning.

    Args:
        model_path: Directory that holds (or will hold) the model files.
    """
    # Create directories if they don't exist
    os.makedirs(model_path, exist_ok=True)

    # 1. Check for model weight files (warn only — upload still proceeds)
    model_files = list(Path(model_path).glob("*.safetensors")) + \
                  list(Path(model_path).glob("*.bin")) + \
                  list(Path(model_path).glob("pytorch_model*.bin"))

    if not model_files:
        print("⚠️ Warning: No model weight files found!")
        print("   Expected: *.safetensors, *.bin, or pytorch_model*.bin")

    # 2. Create config.json if missing
    config_path = Path(model_path) / "config.json"
    if not config_path.exists():
        print("📝 Creating minimal config.json...")
        config = {
            "_name_or_path": "abdelac/Mistral_Test",
            "architectures": ["MistralForCausalLM"],  # Adjust based on your model
            "model_type": "mistral",
            "torch_dtype": "float16",
            "transformers_version": "4.35.0"
        }
        with open(config_path, "w") as f:
            json.dump(config, f, indent=2)

    # 3. Create tokenizer files if missing
    tokenizer_config_path = Path(model_path) / "tokenizer_config.json"
    if not tokenizer_config_path.exists():
        print("📝 Creating tokenizer_config.json...")
        tokenizer_config = {
            "bos_token": "<s>",
            "eos_token": "</s>",
            "pad_token": "</s>",  # Mistral has no dedicated pad token; reuse EOS
            "unk_token": "<unk>",
            "model_max_length": 32768,
            "clean_up_tokenization_spaces": False
        }
        with open(tokenizer_config_path, "w") as f:
            json.dump(tokenizer_config, f, indent=2)

    # 4. Create special_tokens_map.json
    special_tokens_path = Path(model_path) / "special_tokens_map.json"
    if not special_tokens_path.exists():
        print("📝 Creating special_tokens_map.json...")
        special_tokens = {
            "bos_token": "<s>",
            "eos_token": "</s>",
            "pad_token": "</s>",
            "unk_token": "<unk>"
        }
        with open(special_tokens_path, "w") as f:
            json.dump(special_tokens, f, indent=2)

    # 5. Create README.md
    # FIX: in the original the readme_content triple-quoted string was never
    # terminated and never written to disk (the file ended mid-string, a
    # SyntaxError). The string is closed here and written out.
    readme_path = Path(model_path) / "README.md"
    if not readme_path.exists():
        print("📝 Creating README.md...")
        readme_content = """---
language:
- en
license: apache-2.0
tags:
- generated_from_trainer
- mistral
- text-generation
---

# Model Card

## Model Description

This model is a fine-tuned version of Mistral.

## Usage

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("abdelac/Mistral_Test")
tokenizer = AutoTokenizer.from_pretrained("abdelac/Mistral_Test")

prompt = "Explain machine learning"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
"""
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(readme_content)
push.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
from huggingface_hub import upload_folder

# Push the trained LoRA adapter directory (output of train.py) to the HF Hub.
upload_folder(
    folder_path="./lora-out",
    repo_id="abdelac/tinyllama-lora",
    repo_type="model"
)
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # requirements.txt
2
+
3
+
4
+ uvicorn[standard]==0.24.0
5
+ sentencepiece==0.1.99
6
+ bitsandbytes==0.41.1
7
+
8
+ torch
9
+ transformers
10
+ datasets
11
+ peft
12
+ accelerate
13
+ # bitsandbytes is already pinned above (bitsandbytes==0.41.1); duplicate unpinned entry removed
14
+ gradio
train.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

MODEL_ID = "abdelac/tinyllama"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Llama-family tokenizers often ship without a pad token; padding="max_length"
# below would fail without one, so fall back to EOS (standard causal-LM practice).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    load_in_4bit=True,
    device_map="auto"
)

# data.json: list of {"instruction": ..., "output": ...} records
dataset = load_dataset("json", data_files="data.json")["train"]

def tokenize(example):
    """Build the instruction-tuning prompt and tokenize it.

    FIX: adds `labels` (a copy of input_ids) so Trainer can compute the
    causal-LM loss — without labels the original raised at train time
    because the model returns no loss.
    """
    text = f"### Instruction:\n{example['instruction']}\n### Response:\n{example['output']}"
    tokens = tokenizer(text, truncation=True, padding="max_length", max_length=512)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

dataset = dataset.map(tokenize)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # attention projections only — small, standard LoRA target
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

training_args = TrainingArguments(
    output_dir="./lora-out",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # effective batch size of 4
    num_train_epochs=2,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

trainer.train()
model.save_pretrained("./lora-out")  # saves adapter weights only, not the base model
upload.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
from huggingface_hub import upload_folder

# Push the full base model directory to the HF Hub.
upload_folder(
    folder_path="./tinyllama",  # local model directory
    repo_id="abdelac/tinyllama",
    repo_type="model"
)
upload_to_hf.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # upload_to_hf.py
2
+ import os
3
+ from huggingface_hub import HfApi, upload_folder, create_repo, login
4
+ from pathlib import Path
5
+ import shutil
6
+
7
def upload_model_folder(
    folder_path=".tinyllama",  # NOTE(review): likely meant "./tinyllama" — ".tinyllama" is a hidden dir name
    repo_id="abdelac/tinyllama",
    repo_type="model",
    token=None,
    private=False,
    commit_message="Upload model files"
):
    """
    Upload a folder to Hugging Face Hub

    Args:
        folder_path: Path to local folder to upload
        repo_id: Hugging Face repository ID (username/repo-name)
        repo_type: Type of repository ('model', 'dataset', 'space')
        token: Hugging Face token (optional, will prompt if not provided)
        private: Whether repository should be private
        commit_message: Commit message for the upload

    Returns:
        True on success, False on any failure. Errors are printed, not raised.
    """

    # Check if folder exists
    if not os.path.exists(folder_path):
        print(f"❌ Error: Folder '{folder_path}' does not exist!")
        return False

    # Login to Hugging Face (prompts interactively when token is None)
    try:
        login(token=token)
        print("✅ Logged in to Hugging Face")
    except Exception as e:
        print(f"❌ Login failed: {e}")
        return False

    # Check repository exists, create if not
    api = HfApi()
    try:
        # Try to get repo info
        repo_info = api.repo_info(repo_id=repo_id, repo_type=repo_type)
        print(f"✅ Repository exists: {repo_info.id}")
    except Exception:
        # Repository doesn't exist (or lookup failed), create it
        print(f"📦 Creating new repository: {repo_id}")
        try:
            api.create_repo(
                repo_id=repo_id,
                repo_type=repo_type,
                private=private,
                exist_ok=True  # tolerate a create/lookup race
            )
            print(f"✅ Repository created successfully!")
        except Exception as e:
            print(f"❌ Failed to create repository: {e}")
            return False

    # Upload the folder
    print(f"🚀 Uploading folder '{folder_path}' to '{repo_id}'...")
    try:
        # Method 1: Using upload_folder (recommended)
        upload_folder(
            folder_path=folder_path,
            repo_id=repo_id,
            repo_type=repo_type,
            commit_message=commit_message,
            commit_description=f"Upload model files from {folder_path}"
        )

        print(f"✅ Successfully uploaded to: https://huggingface.co/{repo_id}")
        return True

    except Exception as e:
        print(f"❌ Upload failed: {e}")

        # Fallback method using HfApi (same underlying endpoint, kept as a retry)
        try:
            print("🔄 Trying alternative upload method...")
            api.upload_folder(
                folder_path=folder_path,
                repo_id=repo_id,
                repo_type=repo_type,
                commit_message=commit_message
            )
            print(f"✅ Alternative method succeeded!")
            return True
        except Exception as e2:
            print(f"❌ Alternative method also failed: {e2}")
            return False
93
+
94
+ # Alternative: Upload specific files only
95
def upload_model_files(
    local_dir=".tinyllama",
    repo_id="abdelac/tinyllama",
    ignore_patterns=None
):
    """
    Upload specific model files with filtering

    Walks `local_dir` and pushes each file individually, skipping any whose
    name ends with one of `ignore_patterns`. Failures are printed per file.
    """
    from huggingface_hub import HfApi

    api = HfApi()

    # Upload each file individually (for more control)
    for root, dirs, files in os.walk(local_dir):
        for filename in files:
            # Honor suffix-based ignore patterns, if any were given
            if ignore_patterns and any(filename.endswith(p) for p in ignore_patterns):
                continue

            absolute_path = os.path.join(root, filename)
            # Get relative path for HF (path inside the repo)
            rel_path = os.path.relpath(absolute_path, local_dir)

            try:
                api.upload_file(
                    path_or_fileobj=absolute_path,
                    path_in_repo=rel_path,
                    repo_id=repo_id,
                    repo_type="model"
                )
                print(f"📤 Uploaded: {rel_path}")
            except Exception as e:
                print(f"❌ Failed to upload {rel_path}: {e}")
134
+
135
# Example usage
if __name__ == "__main__":
    # Example 1: Simple upload of a local model directory
    upload_model_folder(
        folder_path="./my_model",  # Your model folder
        repo_id="abdelac/tinyllama",
        repo_type="model",
        private=False,
        commit_message="Initial model upload"
    )

    # Example 2: Upload current directory
    # upload_model_folder(
    #     folder_path=".",
    #     repo_id="abdelac/Mistral_Test",
    #     repo_type="model"
    # )