Text Generation
PEFT
Safetensors
Transformers
qwen2
lora
coding
code-generation
conversational
text-generation-inference
Instructions to use girish00/ConicAI_LLM_model with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use girish00/ConicAI_LLM_model with PEFT:
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-0.5B-Instruct")
model = PeftModel.from_pretrained(base_model, "girish00/ConicAI_LLM_model")
- Transformers
How to use girish00/ConicAI_LLM_model with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="girish00/ConicAI_LLM_model")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("girish00/ConicAI_LLM_model")
model = AutoModelForCausalLM.from_pretrained("girish00/ConicAI_LLM_model")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use girish00/ConicAI_LLM_model with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "girish00/ConicAI_LLM_model"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "girish00/ConicAI_LLM_model",
    "messages": [
      { "role": "user", "content": "What is the capital of France?" }
    ]
  }'
Use Docker
docker model run hf.co/girish00/ConicAI_LLM_model
- SGLang
How to use girish00/ConicAI_LLM_model with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "girish00/ConicAI_LLM_model" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "girish00/ConicAI_LLM_model",
    "messages": [
      { "role": "user", "content": "What is the capital of France?" }
    ]
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "girish00/ConicAI_LLM_model" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "girish00/ConicAI_LLM_model",
    "messages": [
      { "role": "user", "content": "What is the capital of France?" }
    ]
  }'
- Docker Model Runner
How to use girish00/ConicAI_LLM_model with Docker Model Runner:
docker model run hf.co/girish00/ConicAI_LLM_model
| import argparse | |
| import ast | |
| import json | |
| import os | |
| import re | |
| import sys | |
| import time | |
def build_instruction_prompt(user_prompt):
    """Wrap ``user_prompt`` in the fixed instruction template sent to the model.

    The template asks for a JSON object with exactly two string fields,
    ``code`` and ``explanation``, lists the output rules, appends the user's
    request, and ends with a ``JSON:`` cue.
    """
    template_lines = [
        "You are a coding assistant. Return ONLY valid JSON with this exact schema:",
        "{",
        ' "code": "string",',
        ' "explanation": "string"',
        "}",
        "Rules:",
        "- code must be practical, runnable, and directly answer the prompt.",
        "- explanation must clearly explain the code and key decisions.",
        "- no markdown fences, no extra keys, no additional text.",
        "",  # blank separator line between the rules and the user request
        f"User prompt: {user_prompt}",
        "JSON:",
    ]
    return "\n".join(template_lines)
def extract_first_json_object(text):
    """Return the first JSON object embedded in ``text``, or ``None``.

    Every ``{`` in ``text`` is tried, left to right, as the start of a JSON
    document. Parsing is delegated to ``json.JSONDecoder.raw_decode``, which
    understands string literals, so braces that appear inside JSON string
    values (very common when the value is source code) no longer truncate
    the candidate. Trailing text after the object is ignored.

    Args:
        text: Raw model output that may contain a JSON object surrounded by
            other text.

    Returns:
        The decoded ``dict`` for the first position that parses as a JSON
        object, or ``None`` when no position does.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            value, _consumed = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            value = None
        # A document starting with "{" decodes to a dict; the check is a guard
        # so callers can rely on ``.get`` being available.
        if isinstance(value, dict):
            return value
        start = text.find("{", start + 1)
    return None
def extract_markdown_code(text):
    """Return the stripped body of the first triple-backtick fence in ``text``.

    An optional ``python``/``py`` language tag (case-insensitive) right after
    the opening fence is skipped. Returns ``""`` when no fence is found.
    """
    fence = re.compile(r"```(?:python|py)?\s*(.*?)```", re.DOTALL | re.IGNORECASE)
    found = fence.search(text)
    return found.group(1).strip() if found else ""
def extract_fix_prompt_code(prompt):
    """Pull the code out of a "fix/debug/repair this code: ..." style prompt.

    Matching is case-insensitive and everything after the colon, across
    newlines, is taken as the code. Returns the stripped code, or ``""`` when
    the prompt does not contain that phrasing.
    """
    pattern = re.compile(
        r"\b(?:fix|debug|repair)\s+this\s+code\s*:\s*(.+)$",
        re.IGNORECASE | re.DOTALL,
    )
    found = pattern.search(prompt.strip())
    if found is None:
        return ""
    return found.group(1).strip()
def fallback_parse_response(text, prompt=""):
    """Build a ``{"code", "explanation"}`` dict from output that is not JSON.

    Precedence:
      1. If the text contains ``Explanation:``, the part before the first
         occurrence supplies the code (its fenced block if any, else the raw
         text) and the part after supplies the explanation.
      2. Otherwise, if the text contains a fenced code block, that is the code.
      3. Otherwise the code is taken from a "fix this code:" prompt, falling
         back to the whole stripped text.
    """
    body = text.strip()
    default_note = "Generated response from the model."

    head, marker, tail = body.partition("Explanation:")
    if marker:
        snippet = extract_markdown_code(head) or head.strip()
        return {"code": snippet, "explanation": tail.strip() or default_note}

    fenced = extract_markdown_code(body)
    if fenced:
        return {
            "code": fenced,
            "explanation": "Extracted the Python code block from the model response.",
        }

    return {
        "code": extract_fix_prompt_code(prompt) or body,
        "explanation": default_note,
    }
def safe_float(value):
    """Convert ``value`` to ``float``, returning ``0.0`` when that is impossible.

    Handles values of the wrong type (``TypeError``), unparsable strings
    (``ValueError``) and integers too large for a float (``OverflowError``).
    """
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: float(10**400) raises instead of returning inf.
        return 0.0
def compute_relevancy_score(prompt, code, explanation):
    """Score how much of the prompt's vocabulary reappears in the answer.

    Tokens are lower-cased identifier-like words of at least two characters.
    The score is the fraction of distinct prompt tokens that also occur in
    ``code`` or ``explanation``, clamped to ``[0, 1]`` and rounded to four
    decimals. A prompt with no tokens scores ``0.0``.
    """
    find_words = re.compile(r"[A-Za-z_][A-Za-z0-9_]+").findall

    wanted = set(find_words(prompt.lower()))
    if not wanted:
        return 0.0

    produced = set(find_words(f"{code}\n{explanation}".lower()))
    ratio = len(wanted.intersection(produced)) / len(wanted)
    return round(min(1.0, max(0.0, ratio)), 4)
def looks_python_like(code):
    """Return ``True`` if ``code`` contains any common Python keyword marker.

    This is a cheap substring check, not a parse; it only decides whether a
    syntax check is worth attempting.
    """
    for marker in ("def ", "import ", "class ", "print(", "return ", "for ", "if "):
        if marker in code:
            return True
    return False
def prompt_expects_code(prompt):
    """Return ``True`` when the prompt appears to ask for code.

    The lower-cased prompt is searched for each marker as a plain substring,
    so a marker also matches inside longer words (e.g. ``sum`` in
    ``summary``).
    """
    lowered = prompt.lower()
    # Grouped by theme; the match is order-independent.
    task_words = ("fix", "debug", "repair", "write", "create", "generate", "implement")
    subject_words = ("function", "code", "snippet", "python")
    arithmetic_words = (
        "multiply", "multiplication", "product",
        "add", "addition", "sum",
        "subtract", "subtraction", "difference",
        "divide", "division", "quotient",
    )
    for marker in task_words + subject_words + arithmetic_words:
        if marker in lowered:
            return True
    return False
def check_hallucination(code, prompt=""):
    """Heuristically flag output that should be Python but is not valid Python.

    Args:
        code: The candidate code text.
        prompt: The original user prompt, used to decide whether code was
            expected at all.

    Returns:
        A ``(flagged, reason)`` tuple. ``flagged`` is ``True`` when the prompt
        expects code but ``code`` does not look like Python, or when ``code``
        looks like Python but cannot be parsed. ``reason`` is a human-readable
        explanation.
    """
    if not looks_python_like(code):
        if prompt_expects_code(prompt):
            return True, "Expected Python code, but output does not look like Python code."
        return False, "No Python syntax check required for this output."

    try:
        ast.parse(code)
    except (SyntaxError, ValueError) as exc:
        # ValueError: before Python 3.12, source containing null bytes raises
        # ValueError rather than SyntaxError; treat it as unparsable too
        # instead of letting it crash the caller.
        return True, f"Syntax error: {exc}"
    return False, "Python syntax check passed."
def repair_common_python_issues(code):
    """Apply a few regex rewrites for common beginner syntax mistakes.

    The rewrites are, in order:
      1. ``def name(args) return expr`` on one line gets its colon and the
         ``return`` is moved to its own indented line.
      2. A single ``=`` in an ``if name = value:`` test becomes ``==``.
      3. A line starting with ``for`` and ending with ``)`` gets a trailing colon.

    Args:
        code: Source text to repair.

    Returns:
        The stripped, possibly rewritten source; ``""`` for blank input.
    """
    fixed = code.strip()
    if not fixed:
        return fixed

    # Fix common "def ... return ..." one-line syntax issue.
    fixed = re.sub(
        r"^def\s+([A-Za-z_]\w*)\((.*?)\)\s+return\s+(.+)$",
        r"def \1(\2):\n return \3",
        fixed,
        flags=re.MULTILINE,
    )

    # Fix assignment in conditional checks. The (?!=) lookahead keeps a valid
    # "if x == y:" from being matched and rewritten into "if x == = y:".
    fixed = re.sub(
        r"\bif\s+([A-Za-z_]\w*)\s*=(?!=)\s*([^:]+):",
        r"if \1 == \2:",
        fixed,
    )

    # Fix missing colon in for loops.
    fixed = re.sub(r"^(for\s+.+\))\s*$", r"\1:", fixed, flags=re.MULTILINE)
    return fixed
def synthesize_common_solution(prompt):
    """Produce a canned ``(code, explanation)`` answer for simple prompts.

    First, any code embedded in a "fix this code:" prompt is run through the
    regex repairs; if the result looks like Python and passes the
    hallucination check, it is returned. Otherwise the prompt is matched
    against four arithmetic keyword groups (multiply, add, subtract, divide,
    in that order) and a two-argument function for the first match is
    returned. ``("", "")`` means nothing applied.
    """
    snippet = repair_common_python_issues(extract_fix_prompt_code(prompt))
    if snippet and looks_python_like(snippet):
        flagged, _reason = check_hallucination(snippet, prompt=prompt)
        if not flagged:
            return (
                snippet,
                "Auto-repair applied for common Python syntax issues detected in the prompt.",
            )

    lowered = prompt.lower()
    # name -> (trigger keywords, operator, description); insertion order is
    # the matching priority.
    templates = {
        "multiply": (("multiply", "multiplication", "product"), "*", "multiplies two numbers"),
        "add": (("add", "addition", "sum"), "+", "adds two numbers"),
        "subtract": (
            ("subtract", "subtraction", "difference"),
            "-",
            "subtracts the second number from the first",
        ),
        "divide": (
            ("divide", "division", "quotient"),
            "/",
            "divides the first number by the second",
        ),
    }
    for name, (keywords, operator, description) in templates.items():
        for keyword in keywords:
            if keyword in lowered:
                return (
                    f"def {name}(a, b):\n return a {operator} b",
                    f"This function {description} and returns the result.",
                )
    return "", ""
def maybe_apply_task_fallback(prompt, code, explanation, hallucination):
    """Patch the model's answer for two known task types.

    * Fix/debug prompts whose output was flagged: the regex repairs are
      applied to ``code`` and, if they change anything, a note is appended to
      the explanation.
    * "linear regression" prompts: when the (possibly repaired) code is
      shorter than 60 characters or never mentions ``LinearRegression``, it
      is replaced by a fixed scikit-learn example with its own explanation.

    Returns:
        The ``(code, explanation)`` pair after any patching.
    """
    lowered = prompt.lower()
    result_code, result_note = code, explanation

    wants_fix = "fix" in lowered or "debug" in lowered
    if hallucination and wants_fix:
        candidate = repair_common_python_issues(code)
        if candidate and candidate != code:
            result_code = candidate
            result_note = (
                explanation
                + " Auto-repair applied for common Python syntax issues detected in generated code."
            ).strip()

    if "linear regression" not in lowered:
        return result_code, result_note

    substantial = len(result_code.strip()) >= 60
    if substantial and "LinearRegression" in result_code:
        return result_code, result_note

    # Empty entries produce the blank lines that separate the example's sections.
    example = "\n".join(
        [
            "import numpy as np",
            "from sklearn.linear_model import LinearRegression",
            "from sklearn.metrics import mean_squared_error, r2_score",
            "",
            "X = np.array([[1], [2], [3], [4], [5]])",
            "y = np.array([2, 4, 6, 8, 10])",
            "",
            "model = LinearRegression()",
            "model.fit(X, y)",
            "predictions = model.predict(X)",
            "",
            "mse = mean_squared_error(y, predictions)",
            "r2 = r2_score(y, predictions)",
            "",
            "print('Coefficients:', model.coef_)",
            "print('Intercept:', model.intercept_)",
            "print('Mean Squared Error (MSE):', mse)",
            "print('R-squared Score:', r2)",
        ]
    )
    example_note = (
        "This creates and trains a Linear Regression model on sample data, then "
        "evaluates it using MSE and R-squared. It prints learned coefficients, "
        "intercept, and performance metrics."
    )
    return example, example_note
def extract_important_tokens(tokenizer, generated_ids, token_confidences, limit=5):
    """Decode the ``limit`` generated tokens with the highest confidence.

    Token ids are paired with their confidences and ranked in descending
    order, with ties keeping generation order. The top ``limit`` ids are
    decoded one at a time via ``tokenizer.decode([id])``, and decodings that
    are empty or whitespace-only are dropped, so fewer than ``limit`` strings
    may be returned. Returns ``[]`` when either input sequence is empty.
    """
    if not (generated_ids and token_confidences):
        return []

    ranked = sorted(
        zip(generated_ids, token_confidences),
        key=lambda pair: pair[1],
        reverse=True,
    )
    pieces = []
    for token_id, _score in ranked[:limit]:
        piece = tokenizer.decode([token_id])
        if piece.strip():
            pieces.append(piece)
    return pieces[:limit]
def build_structured_result(
    prompt,
    generated_text,
    latency_ms,
    tokenizer=None,
    generated_ids=None,
    token_confidences=None,
    default_confidence=0.0,
):
    """Turn raw model output into the final structured report dict.

    Steps, in order:
      1. Parse ``generated_text`` as JSON, or fall back to heuristic parsing.
      2. Fill in missing code/explanation.
      3. Apply task-specific fallbacks and re-check for hallucination.
      4. For fix/debug prompts that are still flagged, try repairing the code
         quoted in the prompt itself.
      5. If still flagged, or if code was expected but the answer is not
         Python-like or scores below 0.25 relevancy, substitute a synthesized
         common solution when one exists.
      6. Compute confidence, relevancy and the most confident tokens.

    Args:
        prompt: The user's original prompt.
        generated_text: Decoded model output.
        latency_ms: Generation time in milliseconds.
        tokenizer: Optional tokenizer used to decode important tokens.
        generated_ids: Optional list of generated token ids.
        token_confidences: Optional per-token probabilities; their mean is the
            reported confidence.
        default_confidence: Confidence to report when no per-token values are
            available.

    Returns:
        A dict with keys ``code``, ``explanation``, ``confidence``,
        ``important_tokens``, ``relevancy_score``, ``hallucination``,
        ``hallucination_check_reason`` and ``latency_ms``.
    """
    parsed = extract_first_json_object(generated_text)
    if parsed is None:
        parsed = fallback_parse_response(generated_text, prompt=prompt)

    # A JSON null must count as "missing"; str(None) would otherwise yield the
    # literal text "None" and bypass the fallbacks below.
    raw_code = parsed.get("code", "")
    raw_explanation = parsed.get("explanation", "")
    code = "" if raw_code is None else str(raw_code).strip()
    explanation = "" if raw_explanation is None else str(raw_explanation).strip()

    if not code:
        code = extract_fix_prompt_code(prompt) or generated_text
    if not explanation:
        explanation = "Model did not provide a clear explanation."

    hallucination, hallucination_reason = check_hallucination(code, prompt=prompt)
    code, explanation = maybe_apply_task_fallback(prompt, code, explanation, hallucination)
    # Re-check because the fallback may have replaced or repaired the code.
    hallucination, hallucination_reason = check_hallucination(code, prompt=prompt)

    if hallucination and ("fix" in prompt.lower() or "debug" in prompt.lower()):
        # The model's answer is still unusable; try repairing the user's own code.
        prompt_code = extract_fix_prompt_code(prompt)
        repaired = repair_common_python_issues(prompt_code)
        if repaired and repaired != code:
            prompt_hallucination, prompt_reason = check_hallucination(repaired, prompt=prompt)
            if not prompt_hallucination:
                code = repaired
                explanation = (
                    "This fixes the Python syntax by adding the missing colon after the "
                    "function definition and indenting the return statement."
                )
                hallucination = False
                hallucination_reason = prompt_reason

    if hallucination or (
        prompt_expects_code(prompt)
        and (not looks_python_like(code) or compute_relevancy_score(prompt, code, explanation) < 0.25)
    ):
        fallback_code, fallback_explanation = synthesize_common_solution(prompt)
        if fallback_code:
            code = fallback_code
            explanation = fallback_explanation
            hallucination, hallucination_reason = check_hallucination(code, prompt=prompt)

    token_confidences = token_confidences or []
    if token_confidences:
        confidence = round(
            max(0.0, min(1.0, sum(token_confidences) / len(token_confidences))),
            4,
        )
    else:
        confidence = round(max(0.0, min(1.0, default_confidence)), 4)

    relevancy_score = compute_relevancy_score(prompt, code, explanation)

    important_tokens = []
    if tokenizer is not None and generated_ids is not None:
        important_tokens = extract_important_tokens(tokenizer, generated_ids, token_confidences)

    return {
        "code": code,
        "explanation": explanation,
        "confidence": safe_float(confidence),
        "important_tokens": important_tokens,
        "relevancy_score": safe_float(relevancy_score),
        "hallucination": hallucination,
        "hallucination_check_reason": hallucination_reason,
        "latency_ms": int(latency_ms),
    }
def find_existing_path(candidates):
    """Return the first path in ``candidates`` that exists, or ``None``."""
    return next((path for path in candidates if os.path.exists(path)), None)
def has_adapter_weights(model_path):
    """Return ``True`` if ``model_path`` contains an adapter weight file.

    Looks for ``adapter_model.safetensors`` or ``adapter_model.bin`` directly
    inside ``model_path``.
    """
    weight_files = ("adapter_model.safetensors", "adapter_model.bin")
    candidates = [os.path.join(model_path, name) for name in weight_files]
    return find_existing_path(candidates) is not None
def has_full_model_weights(model_path):
    """Return ``True`` if ``model_path`` contains full-model weight files.

    Accepted names, directly inside ``model_path``:
      * single-file checkpoints: ``model.safetensors``, ``pytorch_model.bin``
      * sharded checkpoints: ``model-*.safetensors`` and ``pytorch_model-*.bin``

    Returns ``False`` when ``model_path`` does not exist or is not a directory.
    """
    single_file_names = ("model.safetensors", "pytorch_model.bin")
    if any(os.path.exists(os.path.join(model_path, name)) for name in single_file_names):
        return True

    if not os.path.isdir(model_path):
        return False

    # (prefix, suffix) pairs for sharded checkpoints; the .bin form covers
    # sharded PyTorch checkpoints in addition to safetensors shards.
    shard_patterns = (("model-", ".safetensors"), ("pytorch_model-", ".bin"))
    for name in os.listdir(model_path):
        for prefix, suffix in shard_patterns:
            if name.startswith(prefix) and name.endswith(suffix):
                return True
    return False
def main():
    """Command-line entry point: load a model, generate, and print a JSON report.

    The model source is chosen from what is found in ``--model-path``:
      * adapter config + adapter weights: base model wrapped with the adapter;
      * full-model weights and no adapter config: the local full model;
      * anything else: a warning on stderr and the base model alone.

    The prompt is wrapped in the instruction template, generation is timed,
    per-token probabilities are taken from the generation scores, and the
    structured result is printed to stdout as indented JSON.

    Raises:
        FileNotFoundError: If ``--model-path`` does not exist.
    """
    import torch
    from peft import PeftConfig, PeftModel
    from transformers import AutoModelForCausalLM, AutoTokenizer

    cli = argparse.ArgumentParser()
    cli.add_argument("--model-path", type=str, default="./model")
    cli.add_argument("--base-model", type=str, default="Qwen/Qwen2.5-Coder-0.5B-Instruct")
    cli.add_argument("--prompt", type=str, required=True)
    cli.add_argument("--max-new-tokens", type=int, default=320)
    cli.add_argument("--temperature", type=float, default=0.25)
    cli.add_argument("--top-p", type=float, default=0.9)
    cli.add_argument("--do-sample", action="store_true")
    cli.add_argument(
        "--allow-downloads",
        action="store_true",
        help="Allow Transformers to download missing model files from Hugging Face.",
    )
    args = cli.parse_args()

    if not os.path.exists(args.model_path):
        raise FileNotFoundError(
            f"Model path not found: {args.model_path}. Train first using run_pipeline.py."
        )

    offline_only = not args.allow_downloads
    cuda_available = torch.cuda.is_available()
    weights_dtype = torch.float16 if cuda_available else torch.float32

    def load_tokenizer_and_model(source):
        # Every branch below loads the tokenizer first, then the causal LM,
        # from the same source with the same options.
        loaded_tokenizer = AutoTokenizer.from_pretrained(
            source,
            local_files_only=offline_only,
        )
        loaded_model = AutoModelForCausalLM.from_pretrained(
            source,
            torch_dtype=weights_dtype,
            local_files_only=offline_only,
        )
        return loaded_tokenizer, loaded_model

    adapter_config_path = os.path.join(args.model_path, "adapter_config.json")
    adapter_ready = has_adapter_weights(args.model_path)
    full_ready = has_full_model_weights(args.model_path)
    adapter_config_found = os.path.exists(adapter_config_path)

    if adapter_config_found and adapter_ready:
        adapter_cfg = PeftConfig.from_pretrained(args.model_path)
        base_name = adapter_cfg.base_model_name_or_path or args.base_model
        tokenizer, backbone = load_tokenizer_and_model(base_name)
        model = PeftModel.from_pretrained(backbone, args.model_path)
    elif full_ready and not adapter_config_found:
        tokenizer, model = load_tokenizer_and_model(args.model_path)
    else:
        # Graceful fallback when local model folder has config/tokenizer but no weight files.
        fallback_base = args.base_model
        if adapter_config_found:
            try:
                adapter_cfg = PeftConfig.from_pretrained(args.model_path)
                fallback_base = adapter_cfg.base_model_name_or_path or args.base_model
            except Exception:
                # Best effort: an unreadable adapter config just means the
                # CLI-provided base model is used.
                fallback_base = args.base_model

        if full_ready and adapter_config_found and not adapter_ready:
            warning = (
                "Warning: Detected full-model weights together with adapter config but missing "
                "adapter weights. This mixed state makes Transformers try adapter loading and fail. "
                "If you want strict local full-model loading, remove 'adapter_config.json' from "
                f"'{args.model_path}' or retrain and save consistent artifacts."
            )
        else:
            warning = (
                "Warning: No local model weight files found in "
                f"'{args.model_path}'. Falling back to base model '{fallback_base}'. "
                "Run training again to generate adapter/full-model weights."
            )
        print(warning, file=sys.stderr)
        tokenizer, model = load_tokenizer_and_model(fallback_base)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model.eval()
    gen_cfg = model.generation_config
    gen_cfg.do_sample = args.do_sample
    if not args.do_sample:
        # Neutralize sampling-only defaults saved in some checkpoints.
        gen_cfg.temperature = 1.0
        gen_cfg.top_p = 1.0
        gen_cfg.top_k = 50

    device = "cuda" if cuda_available else "cpu"
    model.to(device)

    encoded = tokenizer(build_instruction_prompt(args.prompt), return_tensors="pt").to(device)

    started = time.perf_counter()
    generation_kwargs = dict(
        max_new_tokens=args.max_new_tokens,
        output_scores=True,
        return_dict_in_generate=True,
        do_sample=args.do_sample,
        pad_token_id=tokenizer.eos_token_id,
    )
    if args.do_sample:
        generation_kwargs.update(temperature=args.temperature, top_p=args.top_p)

    with torch.no_grad():
        generated = model.generate(**encoded, **generation_kwargs)
    latency_ms = int((time.perf_counter() - started) * 1000)

    # Keep only the tokens produced after the prompt.
    prompt_len = encoded["input_ids"].shape[1]
    generated_ids = generated.sequences[0][prompt_len:].tolist()
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    token_confidences = []
    if generated.scores:
        # Probability assigned to each chosen token at its generation step.
        token_confidences = [
            float(torch.softmax(step_scores[0], dim=-1)[token_id].item())
            for token_id, step_scores in zip(generated_ids, generated.scores)
        ]

    report = build_structured_result(
        args.prompt,
        generated_text,
        latency_ms,
        tokenizer=tokenizer,
        generated_ids=generated_ids,
        token_confidences=token_confidences,
    )
    print(json.dumps(report, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()