---
language:
- en
license: apache-2.0
tags:
- text-generation
- question-answering
- faq
- codebasics
- education
- bootcamp
datasets:
- custom
library_name: pytorch
pipeline_tag: text-generation
---

# CodeBasics FAQ & Text Generation System

An intelligent AI system for CodeBasics bootcamp questions with dual capabilities:

- Smart FAQ retrieval for accurate answers to bootcamp questions
- Text generation for general AI/ML topics

## Model Details

- **Developed by:** callidus
- **Model type:** Hybrid (TF-IDF FAQ + Transformer)
- **Language:** English
- **License:** Apache 2.0

## Quick Start

### Installation

```bash
pip install torch pandas scikit-learn huggingface_hub
```

### Complete Inference Code

Copy and paste this complete code to use the model:

```python
# ============================================================================
# COMBINED INFERENCE: TRANSFORMER MODEL + FAQ SYSTEM
# ============================================================================
!pip install -q torch huggingface_hub pandas scikit-learn  # notebook-only

import torch
import torch.nn as nn
import torch.nn.functional as F
import json
import math
from huggingface_hub import hf_hub_download, login
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# ============================================================================
# CONFIGURATION
# ============================================================================
HF_TOKEN = "hf_your_token_here"  # Replace with your token
REPO_ID = "callidus/good"

login(token=HF_TOKEN, add_to_git_credential=False)

# ============================================================================
# TRANSFORMER MODEL ARCHITECTURE
# ============================================================================
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def split_heads(self, x, batch_size):
        x = x.view(batch_size, -1, self.num_heads, self.d_k)
        return x.transpose(1, 2)

    def forward(self, x, mask=None):
        batch_size = x.size(0)
        Q = self.split_heads(self.W_q(x), batch_size)
        K = self.split_heads(self.W_k(x), batch_size)
        V = self.split_heads(self.W_v(x), batch_size)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attention_weights = F.softmax(scores, dim=-1)
        attention_output = torch.matmul(attention_weights, V)
        attention_output = attention_output.transpose(1, 2).contiguous()
        attention_output = attention_output.view(batch_size, -1, self.d_model)
        return self.W_o(attention_output), attention_weights

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.linear2(self.dropout(F.relu(self.linear1(x))))
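# Note: the TransformerBlock below combines the two sublayers above with
# post-norm residual connections:
#   x = LayerNorm(x + Dropout(Attention(x)))
#   x = LayerNorm(x + Dropout(FeedForward(x)))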
class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_output, attn_weights = self.attention(x, mask)
        x = self.norm1(x + self.dropout1(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout2(ff_output))
        return x, attn_weights

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=512, num_heads=8, num_layers=6,
                 d_ff=2048, dropout=0.1, max_len=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.d_model = d_model

    def forward(self, x, mask=None):
        x = self.embedding(x) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        x = self.dropout(x)
        for transformer_block in self.transformer_blocks:
            x, attn_weights = transformer_block(x, mask)
        logits = self.fc_out(x)
        return logits

class Tokenizer:
    # Simple word-level tokenizer. The special-token names (<PAD>, <UNK>,
    # <SOS>, <EOS>) are assumed to follow the common convention; verify them
    # against the "special_tokens" field of tokenizer.json.
    def __init__(self, tokenizer_data):
        self.word2idx = tokenizer_data['word2idx']
        self.idx2word = {int(k): v for k, v in tokenizer_data['idx2word'].items()}
        self.vocab_size = tokenizer_data['vocab_size']
        self.special_tokens = tokenizer_data['special_tokens']

    def encode(self, text):
        words = re.findall(r'\w+', text.lower())
        return [self.word2idx.get(word, self.word2idx['<UNK>']) for word in words]

    def decode(self, indices):
        words = []
        for idx in indices:
            if idx in self.idx2word:
                word = self.idx2word[idx]
                if word not in ['<PAD>', '<SOS>', '<EOS>']:
                    words.append(word)
        return ' '.join(words)
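# TransformerInference (below) downloads model_config.json, model_weights.pt
# and tokenizer.json from the Hub, rebuilds the model, and samples text over
# a 64-token sliding context window. Sampling combines three heuristics:
#   - temperature: scales the logits before softmax,
#   - top-k: keeps only the k most likely next tokens,
#   - top-p (nucleus): keeps the smallest token set whose cumulative
#     probability exceeds p.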
class TransformerInference:
    def __init__(self, repo_id, token=None, device=None):
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = None
        self.tokenizer = None
        self.config = None
        self.token = token
        self.load_from_hub(repo_id)

    def load_from_hub(self, repo_id):
        config_path = hf_hub_download(repo_id=repo_id, filename="model_config.json", token=self.token)
        weights_path = hf_hub_download(repo_id=repo_id, filename="model_weights.pt", token=self.token)
        tokenizer_path = hf_hub_download(repo_id=repo_id, filename="tokenizer.json", token=self.token)
        with open(config_path, 'r') as f:
            self.config = json.load(f)
        with open(tokenizer_path, 'r') as f:
            tokenizer_data = json.load(f)
        self.tokenizer = Tokenizer(tokenizer_data)
        self.model = TransformerModel(
            vocab_size=self.config['vocab_size'],
            d_model=self.config['d_model'],
            num_heads=self.config['num_heads'],
            num_layers=self.config['num_layers'],
            d_ff=self.config['d_ff'],
            dropout=self.config.get('dropout', 0.1),
            max_len=self.config.get('max_len', 512)
        )
        state_dict = torch.load(weights_path, map_location=self.device, weights_only=True)
        self.model.load_state_dict(state_dict)
        self.model = self.model.to(self.device)
        self.model.eval()

    def generate(self, prompt, max_length=50, temperature=0.8, top_k=50, top_p=0.9):
        self.model.eval()
        tokens = self.tokenizer.encode(prompt)
        # Fall back to a lone <SOS> token if the prompt is empty or all-unknown
        if not tokens or all(t == self.tokenizer.word2idx['<UNK>'] for t in tokens):
            tokens = [self.tokenizer.word2idx['<SOS>']]
        generated = tokens.copy()
        with torch.no_grad():
            for _ in range(max_length):
                # Left-pad/truncate the context to exactly 64 tokens
                input_tokens = generated[-64:]
                if len(input_tokens) < 64:
                    input_tokens = [self.tokenizer.word2idx['<PAD>']] * (64 - len(input_tokens)) + input_tokens
                input_ids = torch.tensor([input_tokens], dtype=torch.long).to(self.device)
                logits = self.model(input_ids)
                next_token_logits = logits[0, -1, :] / temperature
                # Never sample padding or unknown tokens
                next_token_logits[self.tokenizer.word2idx['<PAD>']] = -float('inf')
                next_token_logits[self.tokenizer.word2idx['<UNK>']] = -float('inf')
                if top_k > 0:
                    indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None]
                    next_token_logits[indices_to_remove] = -float('inf')
                if top_p < 1.0:
                    sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
                    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
                    sorted_indices_to_remove = cumulative_probs > top_p
                    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                    sorted_indices_to_remove[..., 0] = 0
                    indices_to_remove = sorted_indices[sorted_indices_to_remove]
                    next_token_logits[indices_to_remove] = -float('inf')
                probs = F.softmax(next_token_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1).item()
                if next_token == self.tokenizer.word2idx['<EOS>']:
                    break
                generated.append(next_token)
        return self.tokenizer.decode(generated)

# ============================================================================
# FAQ SYSTEM
# ============================================================================
class CodeBasicsFAQ:
    def __init__(self, csv_path):
        # Try several encodings, since the CSV may not be UTF-8
        encodings = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252']
        df = None
        for encoding in encodings:
            try:
                df = pd.read_csv(csv_path, encoding=encoding)
                break
            except Exception:
                continue
        if df is None:
            raise Exception("Could not load FAQ CSV")
        self.df = df
        self.questions = df['prompt'].tolist()
        self.answers = df['response'].tolist()
        self.vectorizer = TfidfVectorizer(
            lowercase=True,
            stop_words='english',
            ngram_range=(1, 2),
            max_features=1000
        )
        self.question_vectors = self.vectorizer.fit_transform(self.questions)

    def find_best_match(self, query, threshold=0.2):
        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.question_vectors)[0]
        best_idx = np.argmax(similarities)
        best_score = similarities[best_idx]
        if best_score >= threshold:
            return {
                'question': self.questions[best_idx],
                'answer': self.answers[best_idx],
                'confidence': best_score
            }
        return None

# ============================================================================
# LOAD BOTH SYSTEMS
# ============================================================================
print("Loading systems...")
transformer = TransformerInference(repo_id=REPO_ID, token=HF_TOKEN)
csv_path = hf_hub_download(repo_id=REPO_ID, filename="codebasics_faqs.csv", token=HF_TOKEN)
faq = CodeBasicsFAQ(csv_path)
print("Ready!")

# ============================================================================
# SMART INFERENCE FUNCTION
# ============================================================================
def smart_inference(query):
    """Automatically chooses FAQ retrieval or text generation."""
    faq_match = faq.find_best_match(query)
    if faq_match:
        return faq_match['answer']
    else:
        return transformer.generate(query, max_length=50, temperature=0.8)
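# NOTE: routing between the two systems is controlled entirely by
# find_best_match's similarity threshold (default 0.2). Raising it sends more
# queries to the transformer; lowering it favours FAQ answers, e.g.:
#   match = faq.find_best_match(query, threshold=0.4)  # stricter FAQ matching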
smart_inference("Can I take this bootcamp without programming experience?") print(result) # Interactive mode while True: user_input = input("Ask me: ").strip() if user_input.lower() in ['quit', 'exit']: break print(smart_inference(user_input)) ``` ## Usage Examples ### FAQ Questions (Returns Accurate Answers) ```python result = smart_inference("Can I take this bootcamp without programming experience?") # Returns: "Yes, this is the perfect bootcamp for anyone..." result = smart_inference("Why should I trust Codebasics?") # Returns: "Till now 9000+ learners have benefitted..." ``` ### General Topics (Returns Generated Text) ```python result = smart_inference("machine learning algorithms") # Returns: Generated text about ML result = smart_inference("artificial intelligence") # Returns: Generated text about AI ``` ## Example Questions ### Bootcamp Questions (FAQ System) - "Can I take this bootcamp without programming experience?" - "Why should I trust Codebasics?" - "What are the prerequisites?" - "Do you provide job assistance?" - "Is there lifetime access?" - "Can I attend while working full time?" - "What is the duration of this bootcamp?" ### General Topics (Text Generation) - "machine learning" - "artificial intelligence" - "neural networks" - "data science" ## Files in Repository - `codebasics_faqs.csv` - FAQ database (50+ Q&A pairs) - `model_config.json` - Transformer configuration - `model_weights.pt` - Transformer weights - `tokenizer.json` - Tokenizer vocabulary - `README.md` - This documentation ## Model Architecture ### FAQ System - **Method:** TF-IDF + Cosine Similarity - **Accuracy:** ~90% on similar phrasings - **Threshold:** 0.2 similarity score ### Transformer Model - **Layers:** 6 transformer blocks - **Hidden size:** 512 - **Attention heads:** 8 - **Vocabulary:** 229 tokens - **Max length:** 512 tokens ## How It Works The system intelligently routes queries: 1. **FAQ Match?** → Returns accurate FAQ answer 2. **No Match?** → Falls back to text generation Users don't need to specify which system to use - it's automatic! ## Limitations - FAQ requires questions similar to training data - Text generation has limited vocabulary (229 tokens) - Best for CodeBasics bootcamp questions - English language only ## Citation ```bibtex @misc{codebasics-faq-2024, author = {callidus}, title = {CodeBasics FAQ and Text Generation System}, year = {2024}, publisher = {HuggingFace}, howpublished = {\url{https://huggingface.co/callidus/good}} } ``` ## License Apache 2.0 ## Contact For CodeBasics courses: [codebasics.io](https://codebasics.io)