Instructions to use girish00/ConicAI_LLM_model with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use girish00/ConicAI_LLM_model with PEFT:

# Load the base causal-LM checkpoint first, then wrap it with the weights
# published in the girish00/ConicAI_LLM_model repo via PEFT.
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Base model that the PEFT weights are applied on top of.
base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-0.5B-Instruct")
# PeftModel.from_pretrained takes the already-loaded base model plus the repo id.
model = PeftModel.from_pretrained(base_model, "girish00/ConicAI_LLM_model")

Transformers

How to use girish00/ConicAI_LLM_model with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="girish00/ConicAI_LLM_model")
# Chat-style input: a list of message dicts, each with a "role" and "content".
messages = [
    {"role": "user", "content": "Who are you?"},
]
# Runs generation on the messages; this snippet does not store or print the
# return value, so capture it (e.g. `result = pipe(messages)`) to inspect it.
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("girish00/ConicAI_LLM_model")
model = AutoModelForCausalLM.from_pretrained("girish00/ConicAI_LLM_model")
messages = [
    {"role": "user", "content": "Who are you?"},
]
# Render the messages with the tokenizer's chat template. tokenize=True with
# return_dict=True and return_tensors="pt" yields a mapping of PyTorch tensors
# (unpacked as **inputs and indexed by "input_ids" below), which is then moved
# to the device the model lives on.
inputs = tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

# Generate at most 40 new tokens for the prompt.
outputs = model.generate(**inputs, max_new_tokens=40)
# Skip the first input_ids.shape[-1] positions (the prompt length) so only the
# newly generated tokens of the first sequence are decoded and printed.
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use girish00/ConicAI_LLM_model with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "girish00/ConicAI_LLM_model"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "girish00/ConicAI_LLM_model",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/girish00/ConicAI_LLM_model

SGLang

How to use girish00/ConicAI_LLM_model with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "girish00/ConicAI_LLM_model" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "girish00/ConicAI_LLM_model",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "girish00/ConicAI_LLM_model" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "girish00/ConicAI_LLM_model",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use girish00/ConicAI_LLM_model with Docker Model Runner:
```
docker model run hf.co/girish00/ConicAI_LLM_model
```

girish00 committed on Apr 19

Commit

dc14a91

verified ·

1 Parent(s): 7e079b1

add dedicated endpoint cloud mode

Browse files

Files changed (1) hide show

infer_cloud.py +53 -15

infer_cloud.py CHANGED Viewed

@@ -59,6 +59,29 @@ def call_direct_inference_api(repo_id, token, prompt_text, generation_kwargs):
     return body
 def run_local_fallback(args, reason):
     if not args.fallback_model_path:
         raise RuntimeError(reason)
@@ -102,7 +125,13 @@ def run_local_fallback(args, reason):
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--repo-id", type=str, required=True)
     parser.add_argument("--prompt", type=str, required=True)
     parser.add_argument("--token", type=str, default=os.getenv("HF_TOKEN"))
     parser.add_argument(
@@ -123,9 +152,10 @@ def main():
     args = parser.parse_args()
     if args.no_local_fallback:
         args.fallback_model_path = ""
     token = args.token or get_token()
-    client = InferenceClient(model=args.repo_id, token=token)
     prompt_text = build_instruction_prompt(args.prompt)
     generation_kwargs = {
@@ -139,26 +169,34 @@ def main():
         generation_kwargs["temperature"] = 0.01
     start_time = time.perf_counter()
-    try:
-        response = client.text_generation(prompt_text, **generation_kwargs)
-    except TypeError:
-        generation_kwargs.pop("return_full_text", None)
         try:
             response = client.text_generation(prompt_text, **generation_kwargs)
         except Exception as exc:
             try:
-                response = call_direct_inference_api(
-                    args.repo_id, token, prompt_text, generation_kwargs
-                )
             except Exception as direct_exc:
                 run_local_fallback(args, f"{exc}; direct API fallback failed: {direct_exc}")
                 return
-    except Exception as exc:
-        try:
-            response = call_direct_inference_api(args.repo_id, token, prompt_text, generation_kwargs)
-        except Exception as direct_exc:
-            run_local_fallback(args, f"{exc}; direct API fallback failed: {direct_exc}")
-            return
     latency_ms = int((time.perf_counter() - start_time) * 1000)
     generated_text = normalize_hf_response(response).strip()

     return body
+def call_endpoint_url(endpoint_url, token, prompt_text, generation_kwargs):
+    headers = {"Content-Type": "application/json"}
+    if token:
+        headers["Authorization"] = f"Bearer {token}"
+    payload = {
+        "inputs": prompt_text,
+        "parameters": generation_kwargs,
+        "options": {"wait_for_model": True},
+    }
+    response = requests.post(endpoint_url, headers=headers, json=payload, timeout=180)
+    try:
+        body = response.json()
+    except ValueError:
+        body = response.text
+    if response.status_code >= 400:
+        raise RuntimeError(f"Endpoint API error {response.status_code}: {body}")
+    if isinstance(body, dict) and body.get("error"):
+        raise RuntimeError(f"Endpoint API error: {body['error']}")
+    return body
 def run_local_fallback(args, reason):
     if not args.fallback_model_path:
         raise RuntimeError(reason)
 def main():
     parser = argparse.ArgumentParser()
+    parser.add_argument("--repo-id", type=str, default="")
+    parser.add_argument(
+        "--endpoint-url",
+        type=str,
+        default=os.getenv("HF_ENDPOINT_URL", ""),
+        help="Dedicated inference endpoint URL. Use this for true cloud inference.",
+    )
     parser.add_argument("--prompt", type=str, required=True)
     parser.add_argument("--token", type=str, default=os.getenv("HF_TOKEN"))
     parser.add_argument(
     args = parser.parse_args()
     if args.no_local_fallback:
         args.fallback_model_path = ""
+    if not args.repo_id and not args.endpoint_url:
+        raise ValueError("Pass --repo-id or --endpoint-url.")
     token = args.token or get_token()
     prompt_text = build_instruction_prompt(args.prompt)
     generation_kwargs = {
         generation_kwargs["temperature"] = 0.01
     start_time = time.perf_counter()
+    if args.endpoint_url:
+        try:
+            response = call_endpoint_url(args.endpoint_url, token, prompt_text, generation_kwargs)
+        except Exception as exc:
+            run_local_fallback(args, str(exc))
+            return
+    else:
+        client = InferenceClient(model=args.repo_id, token=token)
         try:
             response = client.text_generation(prompt_text, **generation_kwargs)
+        except TypeError:
+            generation_kwargs.pop("return_full_text", None)
+            try:
+                response = client.text_generation(prompt_text, **generation_kwargs)
+            except Exception as exc:
+                try:
+                    response = call_direct_inference_api(
+                        args.repo_id, token, prompt_text, generation_kwargs
+                    )
+                except Exception as direct_exc:
+                    run_local_fallback(args, f"{exc}; direct API fallback failed: {direct_exc}")
+                    return
         except Exception as exc:
             try:
+                response = call_direct_inference_api(args.repo_id, token, prompt_text, generation_kwargs)
             except Exception as direct_exc:
                 run_local_fallback(args, f"{exc}; direct API fallback failed: {direct_exc}")
                 return
     latency_ms = int((time.perf_counter() - start_time) * 1000)
     generated_text = normalize_hf_response(response).strip()