Instructions to use LesterCerioli/LLM-GO with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use LesterCerioli/LLM-GO with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("text-generation", model="LesterCerioli/LLM-GO")# Load model directly from transformers import AutoModel model = AutoModel.from_pretrained("LesterCerioli/LLM-GO", dtype="auto") - Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use LesterCerioli/LLM-GO with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "LesterCerioli/LLM-GO" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "LesterCerioli/LLM-GO", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'Use Docker
docker model run hf.co/LesterCerioli/LLM-GO
- SGLang
How to use LesterCerioli/LLM-GO with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "LesterCerioli/LLM-GO" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "LesterCerioli/LLM-GO", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "LesterCerioli/LLM-GO" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "LesterCerioli/LLM-GO", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }' - Docker Model Runner
How to use LesterCerioli/LLM-GO with Docker Model Runner:
docker model run hf.co/LesterCerioli/LLM-GO
| """ | |
| Data preprocessing pipeline. | |
| Steps: | |
| 1. Quality filtering (size, encoding, syntax check) | |
| 2. Deduplication via MinHash LSH | |
| 3. PII scrubbing (emails, tokens, secrets) | |
| 4. Tokenisation and packing into fixed-length sequences | |
| 5. Train / val / test split and TFRecord serialisation | |
| """ | |
| from __future__ import annotations | |
| import hashlib | |
| import re | |
| import struct | |
| from pathlib import Path | |
| from typing import Iterator | |
| import numpy as np | |
| import tensorflow as tf | |
| from rich.console import Console | |
| from rich.progress import track | |
| console = Console() | |
| # ---- Patterns to scrub ------------------------------------------------------- | |
| _EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}") | |
| _TOKEN_RE = re.compile( | |
| r"(?i)(ghp_|gho_|github_pat_|sk-|xox[baprs]-)[A-Za-z0-9_\-]{10,}" | |
| ) | |
| _SECRET_RE = re.compile( | |
| r"(?i)(password|passwd|secret|api.?key|access.?token)\s*[:=]\s*['\"]?[^\s'\"]{8,}" | |
| ) | |
| _IP_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b") | |
| # ---- MinHash config ---------------------------------------------------------- | |
| _MINHASH_SHINGLES = 5 # n-gram size for shingling | |
| _MINHASH_PERMS = 128 # number of hash permutations | |
| _LARGE_PRIME = (1 << 31) - 1 | |
| _MINHASH_A = np.array([ | |
| 1_543_763 + i * 2_654_435_761 for i in range(_MINHASH_PERMS) | |
| ], dtype=np.int64) | |
| _MINHASH_B = np.array([ | |
| 3_267_000_013 + i * 2_246_822_519 for i in range(_MINHASH_PERMS) | |
| ], dtype=np.int64) | |
| class GoPreprocessor: | |
| """Preprocess raw .go files into tokenised, packed TFRecord shards.""" | |
| def __init__( | |
| self, | |
| tokenizer, # GoTokenizer instance | |
| raw_dir: str | Path = "data/raw", | |
| out_dir: str | Path = "data/processed", | |
| seq_len: int = 2048, | |
| shard_size: int = 10_000, # sequences per TFRecord shard | |
| train_frac: float = 0.95, | |
| val_frac: float = 0.04, | |
| min_chars: int = 100, | |
| max_chars: int = 100_000, | |
| dedup_threshold: float = 0.8, | |
| ): | |
| self.tokenizer = tokenizer | |
| self.raw_dir = Path(raw_dir) | |
| self.out_dir = Path(out_dir) | |
| self.seq_len = seq_len | |
| self.shard_size = shard_size | |
| self.train_frac = train_frac | |
| self.val_frac = val_frac | |
| self.min_chars = min_chars | |
| self.max_chars = max_chars | |
| self.dedup_threshold = dedup_threshold | |
| self._seen_hashes: set[tuple] = set() | |
| for split in ("train", "val", "test"): | |
| (self.out_dir / split).mkdir(parents=True, exist_ok=True) | |
| # ------------------------------------------------------------------ | |
| # Public | |
| # ------------------------------------------------------------------ | |
| def run(self) -> dict[str, int]: | |
| """Full pipeline. Returns sequence counts per split.""" | |
| import itertools | |
| from llm_go.data.go_best_practices import GoProjectTemplates | |
| from llm_go.data.patterns.registry import PatternRegistry | |
| # ── Synthetic best-practice examples (cmd/ layout, idioms) | |
| # Repeated 5× so the model learns layout conventions with high weight | |
| templates = GoProjectTemplates() | |
| layout_texts = templates.all_examples() * 5 | |
| # ── Real-world patterns from Medical-App-Core | |
| # (Fiber + GORM + JWT + RabbitMQ + Docker) — repeated 3× | |
| registry = PatternRegistry() | |
| pattern_texts = registry.all_examples() * 3 | |
| console.print(f" [cyan]Layout examples: {len(layout_texts):,}") | |
| console.print(f" [cyan]Pattern examples: {len(pattern_texts):,}") | |
| console.print(registry.summary()) | |
| synthetic_texts = layout_texts + pattern_texts | |
| synthetic_ids = (self.tokenizer.encode(ex) for ex in synthetic_texts) | |
| real_ids = self._tokenise_files(self._iter_filtered_files()) | |
| combined = itertools.chain(synthetic_ids, real_ids) | |
| packed = list(self._pack_sequences(combined)) | |
| np.random.shuffle(packed) | |
| n = len(packed) | |
| n_train = int(n * self.train_frac) | |
| n_val = int(n * self.val_frac) | |
| splits = { | |
| "train": packed[:n_train], | |
| "val": packed[n_train : n_train + n_val], | |
| "test": packed[n_train + n_val :], | |
| } | |
| counts: dict[str, int] = {} | |
| for split, seqs in splits.items(): | |
| self._write_tfrecords(seqs, self.out_dir / split, split) | |
| counts[split] = len(seqs) | |
| console.print(f"[green]Done. Splits: {counts}") | |
| return counts | |
| # ------------------------------------------------------------------ | |
| # Filtering | |
| # ------------------------------------------------------------------ | |
| def _iter_filtered_files(self) -> Iterator[tuple[str, str]]: | |
| """Yield (path, cleaned_source) for files that pass all filters.""" | |
| all_files = list(self.raw_dir.rglob("*.go")) | |
| console.print(f"Found {len(all_files)} raw .go files") | |
| for path in track(all_files, description="Filtering…"): | |
| try: | |
| raw = path.read_bytes() | |
| source = raw.decode("utf-8", errors="ignore") | |
| except OSError: | |
| continue | |
| if not self.min_chars <= len(source) <= self.max_chars: | |
| continue | |
| if not self._is_valid_go(source): | |
| continue | |
| source = self._scrub_pii(source) | |
| sig = self._minhash(source) | |
| if self._is_duplicate(sig): | |
| continue | |
| self._seen_hashes.add(sig) | |
| yield str(path), source | |
| def _is_valid_go(self, source: str) -> bool: | |
| """Heuristic syntax check — must have package declaration.""" | |
| return bool(re.search(r"^package\s+\w+", source, re.MULTILINE)) | |
| def _scrub_pii(self, source: str) -> str: | |
| source = _EMAIL_RE.sub("<EMAIL>", source) | |
| source = _TOKEN_RE.sub(r"\1<REDACTED>", source) | |
| source = _SECRET_RE.sub(r"\1: <REDACTED>", source) | |
| return source | |
| # ------------------------------------------------------------------ | |
| # MinHash deduplication | |
| # ------------------------------------------------------------------ | |
| def _shingles(self, text: str) -> set[int]: | |
| words = text.split() | |
| return { | |
| hash(" ".join(words[i : i + _MINHASH_SHINGLES])) | |
| for i in range(len(words) - _MINHASH_SHINGLES + 1) | |
| } | |
| def _minhash(self, text: str) -> tuple[int, ...]: | |
| shingles = np.array(list(self._shingles(text)), dtype=np.int64) | |
| if len(shingles) == 0: | |
| return tuple(np.zeros(_MINHASH_PERMS, dtype=np.int64).tolist()) | |
| # MinHash: min over all shingles of (a*h + b) % p | |
| hvals = ( | |
| np.outer(shingles, _MINHASH_A) + _MINHASH_B[np.newaxis, :] | |
| ) % _LARGE_PRIME # [S, P] | |
| return tuple(hvals.min(axis=0).tolist()) | |
| def _is_duplicate(self, sig: tuple[int, ...]) -> bool: | |
| if not self._seen_hashes: | |
| return False | |
| # Approximate Jaccard via banded LSH (b bands of r rows, b*r = P) | |
| b, r = 32, 4 # 32 bands × 4 rows = 128 = _MINHASH_PERMS | |
| for band in range(b): | |
| band_sig = sig[band * r : (band + 1) * r] | |
| band_hash = hashlib.md5(struct.pack(f"{r}q", *band_sig)).digest() | |
| # Check against all previously seen band hashes (simplified) | |
| # In production, use datasketch.MinHashLSH for scalability | |
| for seen in self._seen_hashes: | |
| seen_band = seen[band * r : (band + 1) * r] | |
| seen_hash = hashlib.md5(struct.pack(f"{r}q", *seen_band)).digest() | |
| if band_hash == seen_hash: | |
| return True | |
| return False | |
| # ------------------------------------------------------------------ | |
| # Tokenisation & packing | |
| # ------------------------------------------------------------------ | |
| def _tokenise_files( | |
| self, files: Iterator[tuple[str, str]] | |
| ) -> Iterator[list[int]]: | |
| for path, source in files: | |
| # Extract Go version from directory name if available | |
| version = self._infer_version(path) | |
| ids = self.tokenizer.encode_go_file(source, version=version) | |
| yield ids | |
| def _infer_version(self, path: str) -> str: | |
| """Try to extract go version from go.mod in neighbouring files.""" | |
| p = Path(path) | |
| gomod = p.parent / "go.mod" | |
| if gomod.exists(): | |
| m = re.search(r"^go\s+(\d+\.\d+)", gomod.read_text(), re.MULTILINE) | |
| if m: | |
| return m.group(1) | |
| return "" | |
| def _pack_sequences(self, token_stream: Iterator[list[int]]) -> Iterator[list[int]]: | |
| """Concatenate token streams and yield fixed-length windows (no padding).""" | |
| buffer: list[int] = [] | |
| for ids in token_stream: | |
| buffer.extend(ids) | |
| while len(buffer) >= self.seq_len + 1: | |
| yield buffer[: self.seq_len + 1] # +1 for labels shift | |
| buffer = buffer[self.seq_len + 1:] | |
| # Drop remainder (avoids padding in packed pre-training) | |
| # ------------------------------------------------------------------ | |
| # TFRecord I/O | |
| # ------------------------------------------------------------------ | |
| def _write_tfrecords( | |
| self, sequences: list[list[int]], dest: Path, split: str | |
| ) -> None: | |
| n_shards = max(1, len(sequences) // self.shard_size) | |
| for shard_idx in range(n_shards): | |
| shard_path = dest / f"{split}-{shard_idx:05d}.tfrecord" | |
| with tf.io.TFRecordWriter(str(shard_path)) as writer: | |
| start = shard_idx * self.shard_size | |
| end = start + self.shard_size | |
| for seq in sequences[start:end]: | |
| feature = { | |
| "input_ids": tf.train.Feature( | |
| int64_list=tf.train.Int64List(value=seq) | |
| ) | |
| } | |
| writer.write( | |
| tf.train.Example( | |
| features=tf.train.Features(feature=feature) | |
| ).SerializeToString() | |
| ) | |
| console.print(f" [{split}] wrote {len(sequences)} sequences → {n_shards} shards") | |
| def load_dataset( | |
| tfrecord_dir: str | Path, | |
| seq_len: int = 2048, | |
| batch_size: int = 32, | |
| shuffle_buffer: int = 10_000, | |
| ) -> tf.data.Dataset: | |
| """Load a preprocessed split as a batched tf.data.Dataset.""" | |
| paths = sorted(Path(tfrecord_dir).glob("*.tfrecord")) | |
| raw = tf.data.TFRecordDataset( | |
| [str(p) for p in paths], | |
| num_parallel_reads=tf.data.AUTOTUNE, | |
| ) | |
| feature_spec = { | |
| "input_ids": tf.io.FixedLenSequenceFeature( | |
| [], tf.int64, allow_missing=True | |
| ) | |
| } | |
| def _parse(serialised: tf.Tensor) -> dict[str, tf.Tensor]: | |
| parsed = tf.io.parse_single_example(serialised, feature_spec) | |
| ids = tf.cast(parsed["input_ids"], tf.int32) | |
| return {"input_ids": ids[:-1], "labels": ids[1:]} | |
| return ( | |
| raw.map(_parse, num_parallel_calls=tf.data.AUTOTUNE) | |
| .shuffle(shuffle_buffer) | |
| .batch(batch_size, drop_remainder=True) | |
| .prefetch(tf.data.AUTOTUNE) | |
| ) | |