Instructions to use girish00/ConicAI_LLM_model with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use girish00/ConicAI_LLM_model with PEFT:

# Load the base checkpoint first, then wrap it with the PEFT weights from this repo.
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-0.5B-Instruct")
# The second argument is the repo id the PEFT weights are loaded from.
model = PeftModel.from_pretrained(base_model, "girish00/ConicAI_LLM_model")

Transformers

How to use girish00/ConicAI_LLM_model with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="girish00/ConicAI_LLM_model")
# Chat-style input: a list of {"role", "content"} message dictionaries.
messages = [
    {"role": "user", "content": "Who are you?"},
]
# Runs generation on the messages; the return value is not printed in this snippet.
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("girish00/ConicAI_LLM_model")
model = AutoModelForCausalLM.from_pretrained("girish00/ConicAI_LLM_model")
messages = [
    {"role": "user", "content": "Who are you?"},
]
# Render the chat messages with the tokenizer's chat template, request PyTorch
# tensors, and move them to the device the model lives on.
inputs = tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
# Slice off the prompt tokens so only the newly generated continuation is decoded.
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use girish00/ConicAI_LLM_model with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "girish00/ConicAI_LLM_model"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "girish00/ConicAI_LLM_model",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/girish00/ConicAI_LLM_model

SGLang

How to use girish00/ConicAI_LLM_model with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "girish00/ConicAI_LLM_model" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "girish00/ConicAI_LLM_model",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "girish00/ConicAI_LLM_model" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "girish00/ConicAI_LLM_model",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use girish00/ConicAI_LLM_model with Docker Model Runner:
```
docker model run hf.co/girish00/ConicAI_LLM_model
```

ConicAI_LLM_model / infer_local.py

girish00

update endpoint helper files

0b49288 verified about 1 month ago

raw

history blame contribute delete

18.6 kB

	import argparse
	import ast
	import json
	import os
	import re
	import sys
	import time


	def build_instruction_prompt(user_prompt):
	    """Wrap *user_prompt* in the fixed JSON-only instruction template.

	    The template asks the model for a JSON object with exactly the keys
	    ``code`` and ``explanation`` and ends with ``JSON:`` so the completion
	    starts right at the object.
	    """
	    template_lines = [
	        "You are a coding assistant. Return ONLY valid JSON with this exact schema:",
	        "{",
	        ' "code": "string",',
	        ' "explanation": "string"',
	        "}",
	        "Rules:",
	        "- code must be practical, runnable, and directly answer the prompt.",
	        "- explanation must clearly explain the code and key decisions.",
	        "- no markdown fences, no extra keys, no additional text.",
	        "",  # blank line separating the rules from the user's request
	        f"User prompt: {user_prompt}",
	        "JSON:",
	    ]
	    return "\n".join(template_lines)


	def extract_first_json_object(text):
	    """Return the first JSON object embedded in *text*, or ``None``.

	    Each ``{`` in *text* is tried in order as the start of a JSON object and
	    the first one that decodes successfully is returned. Decoding is done by
	    the JSON parser itself rather than by counting braces, so braces that
	    appear inside string values (common when the ``code`` field contains
	    source code) do not cut the object short.

	    Args:
	        text: Raw model output that may contain text around the JSON object.

	    Returns:
	        The decoded ``dict``, or ``None`` when no ``{`` starts a valid object.
	    """
	    decoder = json.JSONDecoder()
	    start = text.find("{")
	    while start != -1:
	        try:
	            # raw_decode parses one JSON value beginning at ``start`` and
	            # tolerates trailing text after it.
	            parsed, _end = decoder.raw_decode(text, start)
	        except json.JSONDecodeError:
	            # Not an object here (e.g. a brace in prose or code); try the next one.
	            start = text.find("{", start + 1)
	        else:
	            return parsed
	    return None


	def extract_markdown_code(text):
	    """Return the contents of the first fenced code block in *text*.

	    The fence may carry an optional ``python`` / ``py`` language tag
	    (case-insensitive). The captured code is stripped of surrounding
	    whitespace.

	    Returns:
	        The code inside the first triple-backtick fence, or ``""`` when
	        *text* contains no complete fence.
	    """
	    # ``python`` is listed before ``py`` so the longer tag is consumed whole;
	    # DOTALL lets the lazy capture span multiple lines.
	    match = re.search(
	        r"```(?:python|py)?\s*(.*?)```",
	        text,
	        flags=re.DOTALL | re.IGNORECASE,
	    )
	    if match:
	        return match.group(1).strip()
	    return ""


	def extract_fix_prompt_code(prompt):
	    """Extract the code that follows a "fix/debug/repair this code:" request.

	    Matching is case-insensitive and everything after the colon, including
	    further lines, is treated as the code to repair.

	    Returns:
	        The stripped code following the colon, or ``""`` when *prompt* does
	        not contain such a request.
	    """
	    match = re.search(
	        r"\b(?:fix|debug|repair)\s+this\s+code\s*:\s*(.+)$",
	        prompt.strip(),
	        flags=re.IGNORECASE | re.DOTALL,
	    )
	    if match:
	        return match.group(1).strip()
	    return ""


	def fallback_parse_response(text, prompt=""):
	    """Build a ``{"code", "explanation"}`` dict from non-JSON model output.

	    Code is taken, in order of preference, from a fenced code block in the
	    output, from the code embedded in a "fix this code:" prompt, or from the
	    whole output. When the output contains an ``Explanation:`` marker, the
	    text before it supplies the code and the text after it the explanation.
	    """
	    body = text.strip()
	    fenced = extract_markdown_code(body)
	    code = fenced or extract_fix_prompt_code(prompt) or body
	    explanation = "Generated response from the model."

	    head, marker, tail = body.partition("Explanation:")
	    if marker:
	        # Split on the first marker only: code before it, explanation after it.
	        code = extract_markdown_code(head) or head.strip()
	        explanation = tail.strip() or explanation
	    elif fenced:
	        explanation = "Extracted the Python code block from the model response."

	    return {"code": code, "explanation": explanation}


	def safe_float(value):
	    """Convert *value* to ``float``, yielding ``0.0`` when it cannot be converted."""
	    try:
	        number = float(value)
	    except (TypeError, ValueError):
	        # Wrong type (e.g. None, list) or an unparsable string.
	        number = 0.0
	    return number


	def compute_relevancy_score(prompt, code, explanation):
	    """Score how many of the prompt's words reappear in the answer.

	    Words are identifier-like tokens of at least two characters, compared
	    case-insensitively. The score is the fraction of distinct prompt words
	    that also occur in ``code`` or ``explanation``, clamped to ``[0, 1]`` and
	    rounded to four decimals. A prompt with no such words scores ``0.0``.
	    """
	    find_words = re.compile(r"[A-Za-z_][A-Za-z0-9_]+").findall
	    wanted = set(find_words(prompt.lower()))
	    produced = set(find_words(f"{code}\n{explanation}".lower()))

	    if not wanted:
	        return 0.0
	    ratio = len(wanted.intersection(produced)) / len(wanted)
	    clamped = max(0.0, min(1.0, ratio))
	    return round(clamped, 4)


	def looks_python_like(code):
	    """Return ``True`` when *code* contains at least one common Python marker.

	    This is a cheap substring heuristic, not a parser.
	    """
	    for marker in ("def ", "import ", "class ", "print(", "return ", "for ", "if "):
	        if marker in code:
	            return True
	    return False


	def prompt_expects_code(prompt):
	    """Return ``True`` when *prompt* looks like a request for code.

	    The check is a case-insensitive substring search for any of the intent
	    markers below (coding verbs/nouns and basic arithmetic operation names).
	    """
	    lowered = prompt.lower()
	    markers = (
	        # coding intent
	        "fix",
	        "debug",
	        "repair",
	        "write",
	        "create",
	        "generate",
	        "implement",
	        "function",
	        "code",
	        "snippet",
	        "python",
	        # arithmetic tasks
	        "multiply",
	        "multiplication",
	        "product",
	        "add",
	        "addition",
	        "sum",
	        "subtract",
	        "subtraction",
	        "difference",
	        "divide",
	        "division",
	        "quotient",
	    )
	    for marker in markers:
	        if marker in lowered:
	            return True
	    return False


	def check_hallucination(code, prompt=""):
	    """Flag output that is not the valid Python the prompt asked for.

	    Returns:
	        A ``(hallucination, reason)`` tuple. ``hallucination`` is ``True``
	        when the prompt expects code but *code* does not look like Python,
	        or when Python-looking *code* fails to parse.
	    """
	    python_like = looks_python_like(code)
	    code_expected = prompt_expects_code(prompt)

	    if not python_like:
	        if code_expected:
	            return True, "Expected Python code, but output does not look like Python code."
	        return False, "No Python syntax check required for this output."

	    try:
	        ast.parse(code)
	    except SyntaxError as exc:
	        return True, f"Syntax error: {exc}"
	    return False, "Python syntax check passed."


	def repair_common_python_issues(code):
	    """Apply regex-based repairs for a few common Python syntax mistakes.

	    Three repairs are attempted, in this order:

	    1. ``def name(args) return expr`` (missing colon, body not indented)
	       becomes a proper two-line function.
	    2. A single ``=`` in an ``if`` condition becomes ``==``.
	    3. A ``for ...)`` line that ends in ``)`` without a colon gets one.

	    Args:
	        code: Source text to repair; surrounding whitespace is stripped.

	    Returns:
	        The repaired source, or the stripped input when nothing matched.
	        An empty or whitespace-only input yields ``""``.
	    """
	    fixed = code.strip()
	    if not fixed:
	        return fixed

	    # Fix common "def ... return ..." one-line syntax issue.
	    fixed = re.sub(
	        r"^def\s+([A-Za-z_]\w*)\((.*?)\)\s+return\s+(.+)$",
	        r"def \1(\2):\n return \3",
	        fixed,
	        flags=re.MULTILINE,
	    )
	    # Fix assignment in conditional checks. The (?!=) lookahead keeps a valid
	    # "if x == y:" from being rewritten into "if x == = y:".
	    fixed = re.sub(
	        r"\bif\s+([A-Za-z_]\w*)\s*=(?!=)\s*([^:]+):",
	        r"if \1 == \2:",
	        fixed,
	    )
	    # Fix missing colon in for loops. Only spaces/tabs are allowed before the
	    # line end so a following blank line is not swallowed by the match.
	    fixed = re.sub(r"^(for\s+.+\))[ \t]*$", r"\1:", fixed, flags=re.MULTILINE)
	    return fixed


	def synthesize_common_solution(prompt):
	    """Produce a canned ``(code, explanation)`` answer for simple prompts.

	    First the code embedded in a "fix this code:" prompt is repaired and
	    returned if it then passes the hallucination check. Otherwise the prompt
	    is searched for arithmetic keywords and a matching two-argument function
	    is generated. ``("", "")`` means no canned answer applies.
	    """
	    lowered = prompt.lower()
	    embedded_code = extract_fix_prompt_code(prompt)

	    candidate = repair_common_python_issues(embedded_code)
	    if candidate and looks_python_like(candidate):
	        is_hallucination, _reason = check_hallucination(candidate, prompt=prompt)
	        if not is_hallucination:
	            note = "Auto-repair applied for common Python syntax issues detected in the prompt."
	            return (candidate, note)

	    # (keywords, function name, operator, description); first match wins.
	    templates = (
	        (("multiply", "multiplication", "product"), "multiply", "*", "multiplies two numbers"),
	        (("add", "addition", "sum"), "add", "+", "adds two numbers"),
	        (
	            ("subtract", "subtraction", "difference"),
	            "subtract",
	            "-",
	            "subtracts the second number from the first",
	        ),
	        (
	            ("divide", "division", "quotient"),
	            "divide",
	            "/",
	            "divides the first number by the second",
	        ),
	    )
	    for keywords, name, operator, description in templates:
	        for keyword in keywords:
	            if keyword in lowered:
	                return (
	                    f"def {name}(a, b):\n return a {operator} b",
	                    f"This function {description} and returns the result.",
	                )

	    return "", ""


	def maybe_apply_task_fallback(prompt, code, explanation, hallucination):
	    """Patch the model's answer for two known task types.

	    * Fix/debug prompts whose answer was flagged as a hallucination get the
	      regex auto-repair applied to the generated code.
	    * "linear regression" prompts whose code is very short or never mentions
	      ``LinearRegression`` are replaced by a fixed scikit-learn example.

	    Returns:
	        ``(code, explanation)``, unchanged when neither case applies.
	    """
	    lowered = prompt.lower()
	    new_code, new_explanation = code, explanation

	    wants_fix = "fix" in lowered or "debug" in lowered
	    if hallucination and wants_fix:
	        repaired = repair_common_python_issues(code)
	        if repaired and repaired != code:
	            new_code = repaired
	            suffix = " Auto-repair applied for common Python syntax issues detected in generated code."
	            new_explanation = (explanation + suffix).strip()

	    if "linear regression" in lowered:
	        too_short = len(new_code.strip()) < 60
	        if too_short or "LinearRegression" not in new_code:
	            example_lines = [
	                "import numpy as np",
	                "from sklearn.linear_model import LinearRegression",
	                "from sklearn.metrics import mean_squared_error, r2_score",
	                "",
	                "X = np.array([[1], [2], [3], [4], [5]])",
	                "y = np.array([2, 4, 6, 8, 10])",
	                "",
	                "model = LinearRegression()",
	                "model.fit(X, y)",
	                "predictions = model.predict(X)",
	                "",
	                "mse = mean_squared_error(y, predictions)",
	                "r2 = r2_score(y, predictions)",
	                "",
	                "print('Coefficients:', model.coef_)",
	                "print('Intercept:', model.intercept_)",
	                "print('Mean Squared Error (MSE):', mse)",
	                "print('R-squared Score:', r2)",
	            ]
	            new_code = "\n".join(example_lines)
	            new_explanation = (
	                "This creates and trains a Linear Regression model on sample data, then "
	                "evaluates it using MSE and R-squared. It prints learned coefficients, "
	                "intercept, and performance metrics."
	            )

	    return new_code, new_explanation


	def extract_important_tokens(tokenizer, generated_ids, token_confidences, limit=5):
	    """Decode the generated tokens the model was most confident about.

	    The ``limit`` highest-confidence ``(token id, confidence)`` pairs are
	    selected, each id is decoded on its own, and tokens that decode to
	    whitespace only are dropped, so fewer than ``limit`` may be returned.
	    """
	    if not (generated_ids and token_confidences):
	        return []

	    ranked = sorted(
	        zip(generated_ids, token_confidences),
	        key=lambda pair: pair[1],
	        reverse=True,
	    )
	    visible = []
	    for token_id, _confidence in ranked[:limit]:
	        piece = tokenizer.decode([token_id])
	        if piece.strip():
	            visible.append(piece)
	    return visible[:limit]


	def build_structured_result(
	    prompt,
	    generated_text,
	    latency_ms,
	    tokenizer=None,
	    generated_ids=None,
	    token_confidences=None,
	    default_confidence=0.0,
	):
	    """Turn raw model output into the structured result dictionary.

	    Steps, in order:

	    1. Parse ``generated_text`` as JSON, falling back to heuristic parsing.
	    2. Fill in missing ``code`` / ``explanation`` values.
	    3. Run the hallucination check, apply task-specific fallbacks, re-check.
	    4. For fix/debug prompts still flagged, try repairing the prompt's code.
	    5. If the answer is still flagged, not Python-like, or barely related to
	       the prompt, substitute a synthesized common solution when one exists.
	    6. Compute confidence, relevancy and the most confident tokens.

	    Args:
	        prompt: The user's original prompt.
	        generated_text: Decoded model output.
	        latency_ms: Generation latency; stored as ``int``.
	        tokenizer: Optional tokenizer used to decode the important tokens.
	        generated_ids: Optional list of generated token ids.
	        token_confidences: Optional per-token probabilities; their mean is
	            the reported confidence.
	        default_confidence: Confidence reported when no per-token
	            probabilities are available.

	    Returns:
	        A dict with the keys ``code``, ``explanation``, ``confidence``,
	        ``important_tokens``, ``relevancy_score``, ``hallucination``,
	        ``hallucination_check_reason`` and ``latency_ms``.
	    """
	    parsed = extract_first_json_object(generated_text)
	    if parsed is None:
	        parsed = fallback_parse_response(generated_text, prompt=prompt)

	    # JSON null decodes to None; str(None) would leak the literal text "None"
	    # into the result and bypass the empty-value fallbacks below.
	    raw_code = parsed.get("code")
	    raw_explanation = parsed.get("explanation")
	    code = "" if raw_code is None else str(raw_code).strip()
	    explanation = "" if raw_explanation is None else str(raw_explanation).strip()
	    if not code:
	        code = extract_fix_prompt_code(prompt) or generated_text
	    if not explanation:
	        explanation = "Model did not provide a clear explanation."

	    # Check, patch, then check again because the patch may have changed the code.
	    hallucination, hallucination_reason = check_hallucination(code, prompt=prompt)
	    code, explanation = maybe_apply_task_fallback(prompt, code, explanation, hallucination)
	    hallucination, hallucination_reason = check_hallucination(code, prompt=prompt)

	    prompt_l = prompt.lower()
	    if hallucination and ("fix" in prompt_l or "debug" in prompt_l):
	        # The generated code is still bad: repair the code quoted in the prompt instead.
	        prompt_code = extract_fix_prompt_code(prompt)
	        repaired = repair_common_python_issues(prompt_code)
	        if repaired and repaired != code:
	            prompt_hallucination, prompt_reason = check_hallucination(repaired, prompt=prompt)
	            if not prompt_hallucination:
	                code = repaired
	                explanation = (
	                    "This fixes the Python syntax by adding the missing colon after the "
	                    "function definition and indenting the return statement."
	                )
	                hallucination = False
	                hallucination_reason = prompt_reason

	    if hallucination or (
	        prompt_expects_code(prompt)
	        and (not looks_python_like(code) or compute_relevancy_score(prompt, code, explanation) < 0.25)
	    ):
	        fallback_code, fallback_explanation = synthesize_common_solution(prompt)
	        if fallback_code:
	            code = fallback_code
	            explanation = fallback_explanation
	            hallucination, hallucination_reason = check_hallucination(code, prompt=prompt)

	    token_confidences = token_confidences or []
	    if token_confidences:
	        # Mean per-token probability, clamped to [0, 1].
	        confidence = round(
	            max(0.0, min(1.0, sum(token_confidences) / len(token_confidences))),
	            4,
	        )
	    else:
	        confidence = round(max(0.0, min(1.0, default_confidence)), 4)

	    relevancy_score = compute_relevancy_score(prompt, code, explanation)
	    important_tokens = []
	    if tokenizer is not None and generated_ids is not None:
	        important_tokens = extract_important_tokens(tokenizer, generated_ids, token_confidences)

	    return {
	        "code": code,
	        "explanation": explanation,
	        "confidence": safe_float(confidence),
	        "important_tokens": important_tokens,
	        "relevancy_score": safe_float(relevancy_score),
	        "hallucination": hallucination,
	        "hallucination_check_reason": hallucination_reason,
	        "latency_ms": int(latency_ms),
	    }


	def find_existing_path(candidates):
	    """Return the first path in *candidates* that exists on disk, else ``None``."""
	    existing = (path for path in candidates if os.path.exists(path))
	    return next(existing, None)


	def has_adapter_weights(model_path):
	    """Return ``True`` when *model_path* contains an adapter weight file.

	    Both ``adapter_model.safetensors`` and ``adapter_model.bin`` are accepted.
	    """
	    weight_names = ("adapter_model.safetensors", "adapter_model.bin")
	    return any(
	        os.path.exists(os.path.join(model_path, name)) for name in weight_names
	    )


	def has_full_model_weights(model_path):
	    """Return ``True`` when *model_path* contains full-model weight files.

	    Accepted are the single-file names ``model.safetensors`` and
	    ``pytorch_model.bin``, or any directory entry named
	    ``model-*.safetensors``.
	    """
	    # Accept common local full-model weight names.
	    single_file_names = ("model.safetensors", "pytorch_model.bin")
	    if any(os.path.exists(os.path.join(model_path, name)) for name in single_file_names):
	        return True

	    if not os.path.isdir(model_path):
	        return False
	    return any(
	        entry.startswith("model-") and entry.endswith(".safetensors")
	        for entry in os.listdir(model_path)
	    )


	def main():
	    """Command-line entry point: load a model, generate, print structured JSON.

	    The model is loaded from ``--model-path`` in one of three ways:

	    * adapter config and adapter weights present: load the base model and
	      apply the PEFT adapter;
	    * full-model weights present and no adapter config: load that model;
	    * otherwise: warn on stderr and fall back to the base model.

	    The prompt is wrapped in the JSON instruction template, a completion is
	    generated, per-token probabilities are collected from the generation
	    scores, and the structured result is printed as JSON on stdout.

	    Raises:
	        FileNotFoundError: If ``--model-path`` does not exist.
	    """
	    # Heavy third-party imports are kept local so the helper functions in this
	    # module can be imported without them.
	    import torch
	    from peft import PeftConfig, PeftModel
	    from transformers import AutoModelForCausalLM, AutoTokenizer

	    parser = argparse.ArgumentParser()
	    parser.add_argument("--model-path", type=str, default="./model")
	    parser.add_argument("--base-model", type=str, default="Qwen/Qwen2.5-Coder-0.5B-Instruct")
	    parser.add_argument("--prompt", type=str, required=True)
	    parser.add_argument("--max-new-tokens", type=int, default=320)
	    parser.add_argument("--temperature", type=float, default=0.25)
	    parser.add_argument("--top-p", type=float, default=0.9)
	    parser.add_argument("--do-sample", action="store_true")
	    parser.add_argument(
	        "--allow-downloads",
	        action="store_true",
	        help="Allow Transformers to download missing model files from Hugging Face.",
	    )
	    args = parser.parse_args()
	    local_files_only = not args.allow_downloads

	    if not os.path.exists(args.model_path):
	        raise FileNotFoundError(
	            f"Model path not found: {args.model_path}. Train first using run_pipeline.py."
	        )

	    adapter_config_path = os.path.join(args.model_path, "adapter_config.json")
	    adapter_weights_present = has_adapter_weights(args.model_path)
	    full_model_weights_present = has_full_model_weights(args.model_path)

	    if os.path.exists(adapter_config_path) and adapter_weights_present:
	        # Adapter checkpoint: load the base model, then apply the adapter.
	        peft_config = PeftConfig.from_pretrained(args.model_path)
	        base_model_name = peft_config.base_model_name_or_path or args.base_model
	        tokenizer = AutoTokenizer.from_pretrained(
	            base_model_name,
	            local_files_only=local_files_only,
	        )
	        base_model = AutoModelForCausalLM.from_pretrained(
	            base_model_name,
	            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
	            local_files_only=local_files_only,
	        )
	        model = PeftModel.from_pretrained(base_model, args.model_path)
	    elif full_model_weights_present and not os.path.exists(adapter_config_path):
	        # Full-model checkpoint saved locally.
	        tokenizer = AutoTokenizer.from_pretrained(
	            args.model_path,
	            local_files_only=local_files_only,
	        )
	        model = AutoModelForCausalLM.from_pretrained(
	            args.model_path,
	            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
	            local_files_only=local_files_only,
	        )
	    else:
	        # Graceful fallback when local model folder has config/tokenizer but no weight files.
	        fallback_base = args.base_model
	        if os.path.exists(adapter_config_path):
	            try:
	                peft_config = PeftConfig.from_pretrained(args.model_path)
	                fallback_base = peft_config.base_model_name_or_path or args.base_model
	            except Exception:
	                # Best effort: an unreadable adapter config just means the
	                # command-line base model is used.
	                fallback_base = args.base_model
	        if full_model_weights_present and os.path.exists(adapter_config_path) and not adapter_weights_present:
	            print(
	                (
	                    "Warning: Detected full-model weights together with adapter config but missing "
	                    "adapter weights. This mixed state makes Transformers try adapter loading and fail. "
	                    "If you want strict local full-model loading, remove 'adapter_config.json' from "
	                    f"'{args.model_path}' or retrain and save consistent artifacts."
	                ),
	                file=sys.stderr,
	            )
	        else:
	            print(
	                (
	                    "Warning: No local model weight files found in "
	                    f"'{args.model_path}'. Falling back to base model '{fallback_base}'. "
	                    "Run training again to generate adapter/full-model weights."
	                ),
	                file=sys.stderr,
	            )
	        tokenizer = AutoTokenizer.from_pretrained(
	            fallback_base,
	            local_files_only=local_files_only,
	        )
	        model = AutoModelForCausalLM.from_pretrained(
	            fallback_base,
	            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
	            local_files_only=local_files_only,
	        )
	    if tokenizer.pad_token is None:
	        tokenizer.pad_token = tokenizer.eos_token
	    model.eval()
	    model.generation_config.do_sample = args.do_sample
	    if not args.do_sample:
	        # Neutralize sampling-only defaults saved in some checkpoints.
	        model.generation_config.temperature = 1.0
	        model.generation_config.top_p = 1.0
	        model.generation_config.top_k = 50

	    device = "cuda" if torch.cuda.is_available() else "cpu"
	    model.to(device)
	    prompt_text = build_instruction_prompt(args.prompt)
	    inputs = tokenizer(prompt_text, return_tensors="pt").to(device)
	    start_time = time.perf_counter()
	    generation_kwargs = {
	        "max_new_tokens": args.max_new_tokens,
	        "output_scores": True,
	        "return_dict_in_generate": True,
	        "do_sample": args.do_sample,
	        "pad_token_id": tokenizer.eos_token_id,
	    }
	    if args.do_sample:
	        generation_kwargs["temperature"] = args.temperature
	        generation_kwargs["top_p"] = args.top_p

	    with torch.no_grad():
	        # Both mappings must be unpacked into keyword arguments: the tokenizer
	        # output supplies input_ids/attention_mask and generation_kwargs the
	        # generation options.
	        generated = model.generate(**inputs, **generation_kwargs)
	    latency_ms = int((time.perf_counter() - start_time) * 1000)

	    # The returned sequence starts with the prompt tokens; keep only the continuation.
	    output_ids = generated.sequences[0]
	    prompt_len = inputs["input_ids"].shape[1]
	    generated_ids = output_ids[prompt_len:].tolist()
	    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

	    token_confidences = []
	    if generated.scores:
	        # One score tensor per generated step; softmax gives the probability
	        # assigned to the token that was actually emitted.
	        for token_id, score_tensor in zip(generated_ids, generated.scores):
	            probs = torch.softmax(score_tensor[0], dim=-1)
	            token_confidences.append(float(probs[token_id].item()))

	    result = build_structured_result(
	        args.prompt,
	        generated_text,
	        latency_ms,
	        tokenizer=tokenizer,
	        generated_ids=generated_ids,
	        token_confidences=token_confidences,
	    )
	    print(json.dumps(result, indent=2, ensure_ascii=False))


	# Run the command-line entry point only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()