|
|
--- |
|
|
language: |
|
|
- en |
|
|
license: apache-2.0 |
|
|
tags: |
|
|
- text-generation |
|
|
- question-answering |
|
|
- faq |
|
|
- codebasics |
|
|
- education |
|
|
- bootcamp |
|
|
datasets: |
|
|
- custom |
|
|
library_name: pytorch |
|
|
pipeline_tag: text-generation |
|
|
--- |
|
|
|
|
|
# CodeBasics FAQ & Text Generation System |
|
|
|
|
|
An AI system for CodeBasics bootcamp questions with two capabilities:
|
|
- Smart FAQ retrieval for accurate answers to bootcamp questions |
|
|
- Text generation for general AI/ML topics |
|
|
|
|
|
## Model Details |
|
|
|
|
|
- **Developed by:** callidus |
|
|
- **Model type:** Hybrid (TF-IDF FAQ + Transformer) |
|
|
- **Language:** English |
|
|
- **License:** Apache 2.0 |
|
|
|
|
|
## Quick Start |
|
|
|
|
|
### Installation |
|
|
|
|
|
```bash |
|
|
pip install torch pandas scikit-learn huggingface_hub |
|
|
``` |
|
|
|
|
|
### Complete Inference Code |
|
|
|
|
|
Copy and paste this complete code to use the model (the `!pip install` line assumes a notebook environment such as Jupyter or Colab):
|
|
|
|
|
```python |
|
|
# ============================================================================ |
|
|
# COMBINED INFERENCE: TRANSFORMER MODEL + FAQ SYSTEM |
|
|
# ============================================================================ |
|
|
|
|
|
!pip install -q torch huggingface_hub pandas scikit-learn |
|
|
|
|
|
import torch |
|
|
import torch.nn as nn |
|
|
import torch.nn.functional as F |
|
|
import json |
|
|
import math |
|
|
from huggingface_hub import hf_hub_download, login |
|
|
import re |
|
|
import pandas as pd |
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
import numpy as np |
|
|
|
|
|
# ============================================================================ |
|
|
# CONFIGURATION |
|
|
# ============================================================================ |
|
|
|
|
|
HF_TOKEN = "hf_your_token_here" # Replace with your token |
|
|
REPO_ID = "callidus/good" |
|
|
|
|
|
login(token=HF_TOKEN, add_to_git_credential=False) |
|
|
|
|
|
# ============================================================================ |
|
|
# TRANSFORMER MODEL ARCHITECTURE |
|
|
# ============================================================================ |
|
|
|
|
|
class MultiHeadAttention(nn.Module): |
|
|
def __init__(self, d_model, num_heads): |
|
|
super().__init__() |
|
|
assert d_model % num_heads == 0 |
|
|
self.d_model = d_model |
|
|
self.num_heads = num_heads |
|
|
self.d_k = d_model // num_heads |
|
|
self.W_q = nn.Linear(d_model, d_model) |
|
|
self.W_k = nn.Linear(d_model, d_model) |
|
|
self.W_v = nn.Linear(d_model, d_model) |
|
|
self.W_o = nn.Linear(d_model, d_model) |
|
|
|
|
|
def split_heads(self, x, batch_size): |
|
|
x = x.view(batch_size, -1, self.num_heads, self.d_k) |
|
|
return x.transpose(1, 2) |
|
|
|
|
|
def forward(self, x, mask=None): |
|
|
batch_size = x.size(0) |
|
|
Q = self.split_heads(self.W_q(x), batch_size) |
|
|
K = self.split_heads(self.W_k(x), batch_size) |
|
|
V = self.split_heads(self.W_v(x), batch_size) |
|
|
scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k) |
|
|
if mask is not None: |
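            # Block attention to positions where the mask is 0.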
|
|
scores = scores.masked_fill(mask == 0, -1e9) |
|
|
attention_weights = F.softmax(scores, dim=-1) |
|
|
attention_output = torch.matmul(attention_weights, V) |
|
|
attention_output = attention_output.transpose(1, 2).contiguous() |
|
|
attention_output = attention_output.view(batch_size, -1, self.d_model) |
|
|
return self.W_o(attention_output), attention_weights |
|
|
|
|
|
class FeedForward(nn.Module): |
|
|
def __init__(self, d_model, d_ff, dropout=0.1): |
|
|
super().__init__() |
|
|
self.linear1 = nn.Linear(d_model, d_ff) |
|
|
self.linear2 = nn.Linear(d_ff, d_model) |
|
|
self.dropout = nn.Dropout(dropout) |
|
|
|
|
|
def forward(self, x): |
|
|
return self.linear2(self.dropout(F.relu(self.linear1(x)))) |
|
|
|
|
|
class TransformerBlock(nn.Module): |
|
|
def __init__(self, d_model, num_heads, d_ff, dropout=0.1): |
|
|
super().__init__() |
|
|
self.attention = MultiHeadAttention(d_model, num_heads) |
|
|
self.feed_forward = FeedForward(d_model, d_ff, dropout) |
|
|
self.norm1 = nn.LayerNorm(d_model) |
|
|
self.norm2 = nn.LayerNorm(d_model) |
|
|
self.dropout1 = nn.Dropout(dropout) |
|
|
self.dropout2 = nn.Dropout(dropout) |
|
|
|
|
|
def forward(self, x, mask=None): |
|
|
attn_output, attn_weights = self.attention(x, mask) |
|
|
x = self.norm1(x + self.dropout1(attn_output)) |
|
|
ff_output = self.feed_forward(x) |
|
|
x = self.norm2(x + self.dropout2(ff_output)) |
|
|
return x, attn_weights |
|
|
|
|
|
class PositionalEncoding(nn.Module): |
|
|
def __init__(self, d_model, max_len=5000): |
|
|
super().__init__() |
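        # Fixed sinusoidal encoding: even dimensions use sine, odd dimensions use cosine.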
|
|
pe = torch.zeros(max_len, d_model) |
|
|
position = torch.arange(0, max_len).unsqueeze(1).float() |
|
|
div_term = torch.exp(torch.arange(0, d_model, 2).float() * |
|
|
-(math.log(10000.0) / d_model)) |
|
|
pe[:, 0::2] = torch.sin(position * div_term) |
|
|
pe[:, 1::2] = torch.cos(position * div_term) |
|
|
pe = pe.unsqueeze(0) |
|
|
self.register_buffer('pe', pe) |
|
|
|
|
|
def forward(self, x): |
|
|
return x + self.pe[:, :x.size(1)] |
|
|
|
|
|
class TransformerModel(nn.Module): |
|
|
def __init__(self, vocab_size, d_model=512, num_heads=8, |
|
|
num_layers=6, d_ff=2048, dropout=0.1, max_len=512): |
|
|
super().__init__() |
|
|
self.embedding = nn.Embedding(vocab_size, d_model) |
|
|
self.pos_encoding = PositionalEncoding(d_model, max_len) |
|
|
self.transformer_blocks = nn.ModuleList([ |
|
|
TransformerBlock(d_model, num_heads, d_ff, dropout) |
|
|
for _ in range(num_layers) |
|
|
]) |
|
|
self.fc_out = nn.Linear(d_model, vocab_size) |
|
|
self.dropout = nn.Dropout(dropout) |
|
|
self.d_model = d_model |
|
|
|
|
|
def forward(self, x, mask=None): |
|
|
x = self.embedding(x) * math.sqrt(self.d_model) |
|
|
x = self.pos_encoding(x) |
|
|
x = self.dropout(x) |
|
|
for transformer_block in self.transformer_blocks: |
|
|
x, attn_weights = transformer_block(x, mask) |
|
|
logits = self.fc_out(x) |
|
|
return logits |
|
|
|
|
|
class Tokenizer: |
|
|
def __init__(self, tokenizer_data): |
|
|
self.word2idx = tokenizer_data['word2idx'] |
|
|
self.idx2word = {int(k): v for k, v in tokenizer_data['idx2word'].items()} |
|
|
self.vocab_size = tokenizer_data['vocab_size'] |
|
|
self.special_tokens = tokenizer_data['special_tokens'] |
|
|
|
|
|
def encode(self, text): |
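        # Word-level tokenization: lowercase, keep word characters, map unknown words to <UNK>.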
|
|
words = re.findall(r'\w+', text.lower()) |
|
|
return [self.word2idx.get(word, self.word2idx['<UNK>']) for word in words] |
|
|
|
|
|
def decode(self, indices): |
|
|
words = [] |
|
|
for idx in indices: |
|
|
if idx in self.idx2word: |
|
|
word = self.idx2word[idx] |
|
|
if word not in ['<PAD>', '<SOS>', '<EOS>']: |
|
|
words.append(word) |
|
|
return ' '.join(words) |
|
|
|
|
|
class TransformerInference: |
|
|
def __init__(self, repo_id, token=None, device=None): |
|
|
self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu') |
|
|
self.model = None |
|
|
self.tokenizer = None |
|
|
self.config = None |
|
|
self.token = token |
|
|
self.load_from_hub(repo_id) |
|
|
|
|
|
def load_from_hub(self, repo_id): |
|
|
config_path = hf_hub_download(repo_id=repo_id, filename="model_config.json", token=self.token) |
|
|
weights_path = hf_hub_download(repo_id=repo_id, filename="model_weights.pt", token=self.token) |
|
|
tokenizer_path = hf_hub_download(repo_id=repo_id, filename="tokenizer.json", token=self.token) |
|
|
|
|
|
with open(config_path, 'r') as f: |
|
|
self.config = json.load(f) |
|
|
|
|
|
with open(tokenizer_path, 'r') as f: |
|
|
tokenizer_data = json.load(f) |
|
|
self.tokenizer = Tokenizer(tokenizer_data) |
|
|
|
|
|
self.model = TransformerModel( |
|
|
vocab_size=self.config['vocab_size'], |
|
|
d_model=self.config['d_model'], |
|
|
num_heads=self.config['num_heads'], |
|
|
num_layers=self.config['num_layers'], |
|
|
d_ff=self.config['d_ff'], |
|
|
dropout=self.config.get('dropout', 0.1), |
|
|
max_len=self.config.get('max_len', 512) |
|
|
) |
|
|
|
|
|
state_dict = torch.load(weights_path, map_location=self.device, weights_only=True) |
|
|
self.model.load_state_dict(state_dict) |
|
|
self.model = self.model.to(self.device) |
|
|
self.model.eval() |
|
|
|
|
|
def generate(self, prompt, max_length=50, temperature=0.8, top_k=50, top_p=0.9): |
|
|
self.model.eval() |
|
|
tokens = self.tokenizer.encode(prompt) |
|
|
|
|
|
if not tokens or all(t == self.tokenizer.word2idx['<UNK>'] for t in tokens): |
|
|
tokens = [self.tokenizer.word2idx['<SOS>']] |
|
|
|
|
|
generated = tokens.copy() |
|
|
|
|
|
with torch.no_grad(): |
|
|
for _ in range(max_length): |
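                # Condition on at most the last 64 tokens, left-padded with <PAD> when shorter.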
|
|
input_tokens = generated[-64:] |
|
|
if len(input_tokens) < 64: |
|
|
input_tokens = [self.tokenizer.word2idx['<PAD>']] * (64 - len(input_tokens)) + input_tokens |
|
|
|
|
|
input_ids = torch.tensor([input_tokens], dtype=torch.long).to(self.device) |
|
|
logits = self.model(input_ids) |
|
|
next_token_logits = logits[0, -1, :] / temperature |
|
|
|
|
|
next_token_logits[self.tokenizer.word2idx['<PAD>']] = -float('inf') |
|
|
next_token_logits[self.tokenizer.word2idx['<UNK>']] = -float('inf') |
|
|
|
|
|
if top_k > 0: |
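                    # Top-k filtering: mask every logit below the k-th largest.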
|
|
indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None] |
|
|
next_token_logits[indices_to_remove] = -float('inf') |
|
|
|
|
|
if top_p < 1.0: |
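                    # Nucleus (top-p) sampling: drop the low-probability tail beyond cumulative mass top_p.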
|
|
sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True) |
|
|
cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) |
|
|
sorted_indices_to_remove = cumulative_probs > top_p |
|
|
sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() |
|
|
sorted_indices_to_remove[..., 0] = 0 |
|
|
indices_to_remove = sorted_indices[sorted_indices_to_remove] |
|
|
next_token_logits[indices_to_remove] = -float('inf') |
|
|
|
|
|
probs = F.softmax(next_token_logits, dim=-1) |
|
|
next_token = torch.multinomial(probs, num_samples=1).item() |
|
|
|
|
|
if next_token == self.tokenizer.word2idx['<EOS>']: |
|
|
break |
|
|
|
|
|
generated.append(next_token) |
|
|
|
|
|
return self.tokenizer.decode(generated) |
|
|
|
|
|
# ============================================================================ |
|
|
# FAQ SYSTEM |
|
|
# ============================================================================ |
|
|
|
|
|
class CodeBasicsFAQ: |
|
|
def __init__(self, csv_path): |
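        # Try several encodings; the FAQ CSV is not guaranteed to be UTF-8.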
|
|
encodings = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252'] |
|
|
df = None |
|
|
|
|
|
for encoding in encodings: |
|
|
try: |
|
|
df = pd.read_csv(csv_path, encoding=encoding) |
|
|
break |
|
|
            except (UnicodeDecodeError, ValueError):
|
|
continue |
|
|
|
|
|
if df is None: |
|
|
            raise RuntimeError("Could not load FAQ CSV with any of the attempted encodings")
|
|
|
|
|
self.df = df |
|
|
self.questions = df['prompt'].tolist() |
|
|
self.answers = df['response'].tolist() |
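        # Unigram + bigram TF-IDF over the stored questions; queries are matched by cosine similarity.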
|
|
|
|
|
self.vectorizer = TfidfVectorizer( |
|
|
lowercase=True, |
|
|
stop_words='english', |
|
|
ngram_range=(1, 2), |
|
|
max_features=1000 |
|
|
) |
|
|
|
|
|
self.question_vectors = self.vectorizer.fit_transform(self.questions) |
|
|
|
|
|
def find_best_match(self, query, threshold=0.2): |
|
|
query_vector = self.vectorizer.transform([query]) |
|
|
similarities = cosine_similarity(query_vector, self.question_vectors)[0] |
|
|
|
|
|
best_idx = np.argmax(similarities) |
|
|
best_score = similarities[best_idx] |
|
|
|
|
|
if best_score >= threshold: |
|
|
return { |
|
|
'question': self.questions[best_idx], |
|
|
'answer': self.answers[best_idx], |
|
|
'confidence': best_score |
|
|
} |
|
|
return None |
|
|
|
|
|
# ============================================================================ |
|
|
# LOAD BOTH SYSTEMS |
|
|
# ============================================================================ |
|
|
|
|
|
print("Loading systems...") |
|
|
transformer = TransformerInference(repo_id=REPO_ID, token=HF_TOKEN) |
|
|
csv_path = hf_hub_download(repo_id=REPO_ID, filename="codebasics_faqs.csv", token=HF_TOKEN) |
|
|
faq = CodeBasicsFAQ(csv_path) |
|
|
print("Ready!") |
|
|
|
|
|
# ============================================================================ |
|
|
# SMART INFERENCE FUNCTION |
|
|
# ============================================================================ |
|
|
|
|
|
def smart_inference(query): |
|
|
"""Automatically chooses FAQ or text generation""" |
|
|
faq_match = faq.find_best_match(query) |
|
|
|
|
|
if faq_match: |
|
|
return faq_match['answer'] |
|
|
else: |
|
|
return transformer.generate(query, max_length=50, temperature=0.8) |
|
|
|
|
|
# ============================================================================ |
|
|
# USAGE |
|
|
# ============================================================================ |
|
|
|
|
|
# Ask a question; the system automatically picks the best method
|
|
result = smart_inference("Can I take this bootcamp without programming experience?") |
|
|
print(result) |
|
|
|
|
|
# Interactive mode |
|
|
while True: |
|
|
user_input = input("Ask me: ").strip() |
|
|
if user_input.lower() in ['quit', 'exit']: |
|
|
break |
|
|
print(smart_inference(user_input)) |
|
|
``` |
|
|
|
|
|
## Usage Examples |
|
|
|
|
|
### FAQ Questions (Returns Accurate Answers) |
|
|
```python |
|
|
result = smart_inference("Can I take this bootcamp without programming experience?") |
|
|
# Returns: "Yes, this is the perfect bootcamp for anyone..." |
|
|
|
|
|
result = smart_inference("Why should I trust Codebasics?") |
|
|
# Returns: "Till now 9000+ learners have benefitted..." |
|
|
``` |
|
|
|
|
|
### General Topics (Returns Generated Text) |
|
|
```python |
|
|
result = smart_inference("machine learning algorithms") |
|
|
# Returns: Generated text about ML |
|
|
|
|
|
result = smart_inference("artificial intelligence") |
|
|
# Returns: Generated text about AI |
|
|
``` |
|
|
|
|
|
## Example Questions |
|
|
|
|
|
### Bootcamp Questions (FAQ System) |
|
|
- "Can I take this bootcamp without programming experience?" |
|
|
- "Why should I trust Codebasics?" |
|
|
- "What are the prerequisites?" |
|
|
- "Do you provide job assistance?" |
|
|
- "Is there lifetime access?" |
|
|
- "Can I attend while working full time?" |
|
|
- "What is the duration of this bootcamp?" |
|
|
|
|
|
### General Topics (Text Generation) |
|
|
- "machine learning" |
|
|
- "artificial intelligence" |
|
|
- "neural networks" |
|
|
- "data science" |
|
|
|
|
|
## Files in Repository |
|
|
|
|
|
- `codebasics_faqs.csv` - FAQ database (50+ Q&A pairs) |
|
|
- `model_config.json` - Transformer configuration |
|
|
- `model_weights.pt` - Transformer weights |
|
|
- `tokenizer.json` - Tokenizer vocabulary |
|
|
- `README.md` - This documentation |
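
To fetch everything in one call instead of file by file, `huggingface_hub.snapshot_download` mirrors the whole repository locally:

```python
from huggingface_hub import snapshot_download

# Downloads model_weights.pt, tokenizer.json, codebasics_faqs.csv, etc.
local_dir = snapshot_download(repo_id="callidus/good", token="hf_your_token_here")
print(local_dir)
```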
|
|
|
|
|
## Model Architecture |
|
|
|
|
|
### FAQ System |
|
|
- **Method:** TF-IDF + Cosine Similarity |
|
|
- **Accuracy:** ~90% on similar phrasings |
|
|
- **Threshold:** 0.2 similarity score |
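
As a minimal, self-contained sketch of this matching step (toy questions here; the real system fits the vectorizer on every question in `codebasics_faqs.csv`):

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Two toy FAQ entries stand in for the full CSV.
questions = [
    "Can I take this bootcamp without programming experience?",
    "Do you provide job assistance?",
]
vectorizer = TfidfVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 2))
question_vectors = vectorizer.fit_transform(questions)

# Score an incoming query against every stored question.
query_vec = vectorizer.transform(["Can I join without programming experience?"])
scores = cosine_similarity(query_vec, question_vectors)[0]
print(scores.max())  # the match is accepted only when this score is >= 0.2
```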
|
|
|
|
|
### Transformer Model |
|
|
- **Layers:** 6 transformer blocks |
|
|
- **Hidden size:** 512 |
|
|
- **Attention heads:** 8 |
|
|
- **Vocabulary:** 229 tokens |
|
|
- **Max length:** 512 tokens |
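
As a rough sanity check, this configuration comes to about 19M trainable parameters. A back-of-the-envelope sketch, assuming the `TransformerModel` layout from the inference code above (separate embedding and output head, no weight tying):

```python
vocab, d_model, d_ff, layers = 229, 512, 2048, 6

embedding = vocab * d_model                                 # input embedding
attention = 4 * (d_model * d_model + d_model)               # W_q, W_k, W_v, W_o (+ biases)
ffn = (d_model * d_ff + d_ff) + (d_ff * d_model + d_model)  # two linear layers
norms = 2 * 2 * d_model                                     # two LayerNorms per block
output_head = d_model * vocab + vocab                       # fc_out

total = embedding + layers * (attention + ffn + norms) + output_head
print(f"{total:,} parameters")  # 19,149,029, i.e. ~19.1M
```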
|
|
|
|
|
## How It Works |
|
|
|
|
|
The system intelligently routes queries: |
|
|
|
|
|
1. **FAQ Match?** → Returns accurate FAQ answer |
|
|
2. **No Match?** → Falls back to text generation |
|
|
|
|
|
Users don't need to specify which system to use; routing is automatic.
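
The routing threshold can also be tuned. A quick sketch, assuming the inference code above has already been run (so `faq` and `transformer` exist); raising the threshold routes more queries to text generation:

```python
query = "do you offer job help?"

# Stricter matching than the default threshold of 0.2.
match = faq.find_best_match(query, threshold=0.3)
if match:
    print(f"FAQ (confidence {match['confidence']:.2f}): {match['answer']}")
else:
    print(transformer.generate(query, max_length=50))
```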
|
|
|
|
|
## Limitations |
|
|
|
|
|
- FAQ retrieval works best when questions are phrased similarly to entries in the FAQ database
|
|
- Text generation uses a small word-level vocabulary (229 tokens), so generated text is narrow in scope
|
|
- Best for CodeBasics bootcamp questions |
|
|
- English language only |
|
|
|
|
|
## Citation |
|
|
|
|
|
```bibtex |
|
|
@misc{codebasics-faq-2024, |
|
|
author = {callidus}, |
|
|
title = {CodeBasics FAQ and Text Generation System}, |
|
|
year = {2024}, |
|
|
publisher = {HuggingFace}, |
|
|
howpublished = {\url{https://huggingface.co/callidus/good}} |
|
|
} |
|
|
``` |
|
|
|
|
|
## License |
|
|
|
|
|
Apache 2.0 |
|
|
|
|
|
## Contact |
|
|
|
|
|
For CodeBasics courses: [codebasics.io](https://codebasics.io) |
|
|
|