SpencerCPurdy's picture
Update app.py
6cf410e verified
"""
Fine-Tuned RAG Framework for Python Documentation Q&A
Author: Spencer Purdy
Description: Production-ready RAG system that answers questions about Python's standard library.
Uses fine-tuned GPT-2 model with vector search for accurate, grounded responses.
Data Source: Python 3 Documentation (PSF License - https://docs.python.org/3/license.html)
Model: GPT-2 Small (124M parameters) fine-tuned with LoRA
Vector Store: ChromaDB with sentence-transformers embeddings
IMPORTANT LIMITATIONS:
- Limited to Python standard library knowledge (no third-party packages)
- May not have information on Python versions newer than training data
- Best for conceptual questions; may struggle with very specific version details
- Responses are based on retrieved documentation chunks; may miss context from other sections
- Fine-tuning improves relevance but does not guarantee factual accuracy
- Not a replacement for official documentation - always verify critical information
This system is designed to demonstrate ML engineering skills including:
- Data collection and preprocessing
- Model fine-tuning with LoRA/PEFT
- RAG pipeline implementation
- Comprehensive evaluation metrics
- Production-ready error handling
Model Performance (Validated on Test Set):
- Retrieval Accuracy: ~94%
- ROUGE-L F1: ~0.08
- BERTScore F1: ~0.80
- Average Latency: ~2 seconds
Limitations:
- Limited to Python standard library
- Best for Python 3.x (may have gaps for latest versions)
- Always verify critical information with official docs
- Not suitable for production use without further validation
Reproducibility:
- Random seed: 42 (set across all libraries)
- All dependency versions specified
- Deterministic training process
"""
# ============================================================================
# INSTALLATION
# ============================================================================
# !pip install -q torch transformers datasets peft gradio pandas numpy scikit-learn tqdm requests beautifulsoup4 rouge-score bert-score accelerate sentence-transformers chromadb
# ============================================================================
# IMPORTS
# ============================================================================
import os
import json
import time
import logging
import warnings
import re
import random
import gc
import requests
import shutil
from datetime import datetime
from typing import List, Dict, Tuple, Optional, Any, Union
from dataclasses import dataclass, field, asdict
from collections import defaultdict
import traceback
# Disable warnings and telemetry for cleaner output
warnings.filterwarnings('ignore')
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["ANONYMIZED_TELEMETRY"] = "False"
# Core ML libraries
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm.auto import tqdm
# Transformers and PEFT for model fine-tuning
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling,
set_seed
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from datasets import Dataset
# Vector database and embeddings
import chromadb
from sentence_transformers import SentenceTransformer
# Evaluation metrics
from rouge_score import rouge_scorer
try:
from bert_score import score as bert_score
BERTSCORE_AVAILABLE = True
except Exception as e:
print(f"BERTScore not available: {e}")
BERTSCORE_AVAILABLE = False
# UI framework
import gradio as gr
# Web scraping for data collection
from bs4 import BeautifulSoup
# ============================================================================
# REPRODUCIBILITY SETUP
# ============================================================================
RANDOM_SEED = 42
def set_all_seeds(seed: int = RANDOM_SEED):
"""
Set random seeds for all libraries to ensure reproducibility.
This makes the training process deterministic across runs.
"""
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
set_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
set_all_seeds(RANDOM_SEED)
# ============================================================================
# LOGGING SETUP
# ============================================================================
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Clear GPU cache and set device
if torch.cuda.is_available():
torch.cuda.empty_cache()
gc.collect()
device = torch.device("cuda")
logger.info(f"GPU available: {torch.cuda.get_device_name(0)}")
else:
device = torch.device("cpu")
logger.info("Running on CPU")
# ============================================================================
# SYSTEM CONFIGURATION
# ============================================================================
@dataclass
class SystemConfig:
"""
Comprehensive system configuration.
All hyperparameters are documented with rationale.
"""
# Model configuration
base_model_name: str = "gpt2"
embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
# Fine-tuning parameters optimized for Colab
num_train_epochs: int = 3
per_device_train_batch_size: int = 4
gradient_accumulation_steps: int = 4
learning_rate: float = 2e-4
warmup_steps: int = 100
max_steps: int = 500
logging_steps: int = 50
save_steps: int = 250
eval_steps: int = 250
# LoRA configuration for parameter-efficient fine-tuning
lora_r: int = 16
lora_alpha: int = 32
lora_dropout: float = 0.05
lora_target_modules: List[str] = field(default_factory=lambda: ["c_attn", "c_proj"])
# Generation parameters tuned for concise, accurate responses
max_input_length: int = 512
max_new_tokens: int = 150
temperature: float = 0.7
top_p: float = 0.9
top_k: int = 50
repetition_penalty: float = 1.2
# RAG parameters
chunk_size: int = 400
chunk_overlap: int = 50
retrieval_top_k: int = 3
min_relevance_score: float = 0.15
# Data collection
max_documents: int = 150
# Paths
model_save_path: str = "./checkpoint-500"
vector_db_path: str = "."
data_cache_path: str = "./python_docs_cache.json"
# Evaluation
eval_sample_size: int = 50
# Random seed for reproducibility
random_seed: int = RANDOM_SEED
config = SystemConfig()
# Log configuration
logger.info("=" * 70)
logger.info("Fine-Tuned RAG Framework - Configuration")
logger.info("=" * 70)
logger.info(f"Base Model: {config.base_model_name}")
logger.info(f"Embedding Model: {config.embedding_model_name}")
logger.info(f"Random Seed: {config.random_seed} (for reproducibility)")
logger.info(f"Device: {device}")
logger.info(f"Training Steps: {config.max_steps}")
logger.info(f"LoRA Rank: {config.lora_r}")
logger.info(f"Min Relevance Score: {config.min_relevance_score}")
logger.info("=" * 70)
# ============================================================================
# DATA COLLECTION: Python Documentation
# ============================================================================
class PythonDocsCollector:
"""
Collects Python standard library documentation from official sources.
Includes both API reference and tutorial/concept pages for comprehensive coverage.
Data Source: https://docs.python.org/3/
License: PSF License (https://docs.python.org/3/license.html)
The Python Software Foundation License is GPL-compatible and allows
redistribution and modification with proper attribution.
"""
def __init__(self, cache_path: str = config.data_cache_path):
self.cache_path = cache_path
self.base_url = "https://docs.python.org/3/"
self.collected_docs = []
def collect_documentation(self, max_docs: int = config.max_documents) -> List[Dict[str, str]]:
"""
Collect Python documentation with proper error handling.
Uses caching to avoid redundant network requests.
Collects both library reference and tutorial content for better conceptual coverage.
Returns:
List of dictionaries with title, content, url, and module keys
"""
# Check cache first to avoid redundant web requests
if os.path.exists(self.cache_path):
logger.info(f"Loading cached documentation from {self.cache_path}")
with open(self.cache_path, 'r', encoding='utf-8') as f:
return json.load(f)
logger.info("Collecting Python documentation from official sources...")
# Core Python standard library modules and tutorial pages
pages = [
# Core language features and tutorials
"tutorial/introduction.html",
"tutorial/controlflow.html",
"tutorial/datastructures.html",
"tutorial/modules.html",
"tutorial/inputoutput.html",
"tutorial/errors.html",
"tutorial/classes.html",
"tutorial/stdlib.html",
"tutorial/stdlib2.html",
# Language reference
"reference/expressions.html",
"reference/compound_stmts.html",
"reference/datamodel.html",
# Standard library reference
"library/intro.html",
"library/functions.html",
"library/constants.html",
"library/stdtypes.html",
"library/exceptions.html",
"library/string.html",
"library/re.html",
"library/datetime.html",
"library/collections.html",
"library/collections.abc.html",
"library/itertools.html",
"library/functools.html",
"library/operator.html",
"library/pathlib.html",
"library/os.html",
"library/os.path.html",
"library/io.html",
"library/json.html",
"library/csv.html",
"library/pickle.html",
"library/sqlite3.html",
"library/math.html",
"library/random.html",
"library/statistics.html",
"library/sys.html",
"library/typing.html",
"library/unittest.html",
"library/logging.html",
"library/threading.html",
"library/multiprocessing.html",
"library/subprocess.html",
"library/socket.html",
"library/http.html",
"library/urllib.html",
"library/email.html",
"library/argparse.html",
"library/getopt.html",
"library/tempfile.html",
"library/glob.html",
"library/shutil.html",
"library/zipfile.html",
"library/gzip.html",
"library/hashlib.html",
"library/hmac.html",
"library/secrets.html",
"library/time.html",
"library/calendar.html",
"library/enum.html",
"library/contextlib.html",
"library/abc.html",
"library/copy.html",
"library/pprint.html",
"library/textwrap.html",
"library/struct.html",
"library/codecs.html",
]
documents = []
for i, page in enumerate(pages[:max_docs]):
try:
url = self.base_url + page
logger.info(f"Fetching {i+1}/{len(pages[:max_docs])}: {page}")
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Extract page title
title_tag = soup.find('h1')
title = title_tag.get_text() if title_tag else page.split('/')[-1].replace('.html', '')
# Extract main content from documentation
content_div = soup.find('div', class_='body') or soup.find('div', role='main') or soup.find('section', id='tutorial')
if content_div:
# Remove navigation and non-content elements
for tag in content_div.find_all(['script', 'style', 'nav', 'footer']):
tag.decompose()
# Extract text content
content = content_div.get_text(separator='\n', strip=True)
# Clean up excessive whitespace
content = re.sub(r'\n\s*\n', '\n\n', content)
content = re.sub(r' +', ' ', content)
if len(content) > 100:
# Determine module/category
if 'tutorial/' in page:
module = 'tutorial_' + page.split('/')[-1].replace('.html', '')
elif 'reference/' in page:
module = 'reference_' + page.split('/')[-1].replace('.html', '')
else:
module = page.split('/')[-1].replace('.html', '')
documents.append({
'title': title,
'content': content,
'url': url,
'module': module
})
logger.info(f" Collected: {title} ({len(content)} chars)")
# Respectful rate limiting to avoid overwhelming the server
time.sleep(0.5)
except Exception as e:
logger.warning(f" Failed to fetch {page}: {str(e)}")
continue
logger.info(f"Successfully collected {len(documents)} documents")
# Cache the results for future runs
with open(self.cache_path, 'w', encoding='utf-8') as f:
json.dump(documents, f, indent=2)
return documents
# ============================================================================
# DATA PREPROCESSING
# ============================================================================
class DocumentProcessor:
"""
Processes and chunks documents for RAG system.
Implements intelligent chunking that preserves semantic context.
"""
def __init__(self, chunk_size: int = config.chunk_size,
chunk_overlap: int = config.chunk_overlap):
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
def chunk_document(self, text: str) -> List[str]:
"""
Split document into overlapping chunks.
Strategy: Split on paragraph boundaries when possible to preserve semantic context.
Overlapping chunks help maintain continuity across chunk boundaries.
"""
# First split into paragraphs
paragraphs = text.split('\n\n')
chunks = []
current_chunk = ""
for para in paragraphs:
# Check if adding this paragraph would exceed chunk size
if len(current_chunk) + len(para) > self.chunk_size:
if current_chunk:
chunks.append(current_chunk.strip())
# Create overlap by including end of previous chunk
overlap_start = max(0, len(current_chunk) - self.chunk_overlap)
current_chunk = current_chunk[overlap_start:] + "\n\n" + para
else:
# Paragraph itself is larger than chunk size, split by sentences
sentences = para.split('. ')
for sent in sentences:
if len(current_chunk) + len(sent) > self.chunk_size:
if current_chunk:
chunks.append(current_chunk.strip())
current_chunk = sent + '. '
else:
current_chunk += sent + '. '
else:
current_chunk += para + "\n\n"
# Add final chunk
if current_chunk:
chunks.append(current_chunk.strip())
return chunks
def process_documents(self, documents: List[Dict]) -> List[Dict]:
"""
Process all documents into chunks with metadata preserved.
Each chunk maintains reference to its source document for attribution.
"""
processed_chunks = []
logger.info("Processing and chunking documents...")
for doc in tqdm(documents, desc="Processing documents"):
chunks = self.chunk_document(doc['content'])
for i, chunk in enumerate(chunks):
processed_chunks.append({
'text': chunk,
'title': doc['title'],
'url': doc['url'],
'module': doc['module'],
'chunk_index': i,
'total_chunks': len(chunks)
})
logger.info(f"Created {len(processed_chunks)} chunks from {len(documents)} documents")
return processed_chunks
# ============================================================================
# TRAINING DATA GENERATION
# ============================================================================
class TrainingDataGenerator:
"""
Generates training data for fine-tuning.
Creates question-answer pairs from documentation chunks to teach the model
how to respond to Python-related queries with appropriate context.
"""
def __init__(self):
# Templates for generating diverse question-answer pairs
self.qa_templates = [
"Question: What is {topic}?\nAnswer: {answer}",
"Question: How do I use {topic}?\nAnswer: {answer}",
"Question: Explain {topic}.\nAnswer: {answer}",
"Question: What does {topic} do?\nAnswer: {answer}",
"Question: Tell me about {topic}.\nAnswer: {answer}",
"Question: How does {topic} work?\nAnswer: {answer}",
"Question: What are the key features of {topic}?\nAnswer: {answer}",
]
def extract_key_concepts(self, text: str) -> List[str]:
"""
Extract key concepts that could be topics for questions.
Focuses on Python functions, classes, modules, and important terminology.
"""
concepts = []
# Extract Python function/method names
identifiers = re.findall(r'\b[a-z_][a-z0-9_]*\(\)', text)
concepts.extend([id.replace('()', '') for id in identifiers[:5]])
# Extract capitalized terms likely to be classes or important concepts
capitalized = re.findall(r'\b[A-Z][a-z]+\w*\b', text)
concepts.extend(capitalized[:4])
# Extract common Python terminology
python_terms = ['list comprehension', 'generator', 'decorator', 'iterator',
'exception', 'context manager', 'lambda', 'module']
for term in python_terms:
if term.lower() in text.lower():
concepts.append(term)
# Remove duplicates while preserving order
seen = set()
unique_concepts = []
for concept in concepts:
if concept not in seen and len(concept) > 2:
seen.add(concept)
unique_concepts.append(concept)
return unique_concepts[:3]
def create_concise_answer(self, text: str, max_length: int = 200) -> str:
"""
Create a concise answer from the text by extracting the most relevant sentences.
Prioritizes sentences that contain key information.
"""
sentences = [s.strip() for s in text.split('.') if len(s.strip()) > 20]
if not sentences:
return text[:max_length].strip()
# Take first 2-3 sentences for concise answers
answer_sentences = sentences[:min(3, len(sentences))]
answer = '. '.join(answer_sentences) + '.'
# Ensure answer is not too long
if len(answer) > max_length:
answer = answer[:max_length].rsplit('.', 1)[0] + '.'
return answer
def generate_training_samples(self, chunks: List[Dict],
samples_per_chunk: int = 2) -> List[str]:
"""
Generate training samples from document chunks.
Creates question-answer pairs that will be used to fine-tune the model.
Generates multiple samples per chunk to increase training data diversity.
"""
training_texts = []
logger.info("Generating training samples...")
# Process more chunks for better coverage
for chunk in tqdm(chunks[:400], desc="Generating training data"):
text = chunk['text']
if len(text) < 100:
continue
# Extract key concepts from chunk
concepts = self.extract_key_concepts(text)
# If no concepts found, use module name or title
if not concepts:
concepts = [chunk['title'], chunk['module']]
# Generate multiple samples per chunk
for concept in concepts[:samples_per_chunk]:
template = random.choice(self.qa_templates)
answer = self.create_concise_answer(text, max_length=250)
training_text = template.format(
topic=concept,
answer=answer
)
training_texts.append(training_text)
logger.info(f"Generated {len(training_texts)} training samples")
return training_texts
# ============================================================================
# DATA COLLECTION EXECUTION
# ============================================================================
# Collect and process data
collector = PythonDocsCollector()
raw_documents = collector.collect_documentation(max_docs=config.max_documents)
processor = DocumentProcessor()
processed_chunks = processor.process_documents(raw_documents)
generator = TrainingDataGenerator()
training_texts = generator.generate_training_samples(processed_chunks, samples_per_chunk=2)
logger.info(f"Data collection complete: {len(raw_documents)} documents, {len(processed_chunks)} chunks")
# ============================================================================
# VECTOR DATABASE SETUP
# ============================================================================
class VectorDatabase:
"""
ChromaDB-based vector database for document retrieval.
Uses sentence-transformers to create embeddings that capture semantic meaning
for efficient similarity search.
"""
def __init__(self, db_path: str = config.vector_db_path,
embedding_model_name: str = config.embedding_model_name):
self.db_path = db_path
self.embedding_model = SentenceTransformer(embedding_model_name)
# Initialize ChromaDB with persistent storage
self.client = chromadb.PersistentClient(path=db_path)
# Get or create collection
try:
self.collection = self.client.get_collection("python_docs")
logger.info(f"Loaded existing collection with {self.collection.count()} documents")
except:
self.collection = self.client.create_collection(
name="python_docs",
metadata={"description": "Python documentation chunks"}
)
logger.info("Created new vector database collection")
def add_documents(self, chunks: List[Dict]):
"""
Add document chunks to vector database.
Generates embeddings and stores them for efficient semantic search.
"""
if self.collection.count() > 0:
logger.info("Vector database already populated, skipping...")
return
logger.info("Adding documents to vector database...")
texts = [chunk['text'] for chunk in chunks]
metadatas = [{k: v for k, v in chunk.items() if k != 'text'}
for chunk in chunks]
ids = [f"chunk_{i}" for i in range(len(chunks))]
# Generate embeddings for semantic search
logger.info("Generating embeddings...")
embeddings = self.embedding_model.encode(
texts,
show_progress_bar=True,
batch_size=32
).tolist()
# Add to database in batches
batch_size = 100
for i in range(0, len(texts), batch_size):
end_idx = min(i + batch_size, len(texts))
self.collection.add(
embeddings=embeddings[i:end_idx],
documents=texts[i:end_idx],
metadatas=metadatas[i:end_idx],
ids=ids[i:end_idx]
)
logger.info(f"Added {len(texts)} documents to vector database")
def search(self, query: str, top_k: int = config.retrieval_top_k) -> List[Dict]:
"""
Search for relevant documents using semantic similarity.
Returns:
List of dictionaries with text, score, and metadata
"""
# Generate query embedding
query_embedding = self.embedding_model.encode(query).tolist()
# Search for similar documents
results = self.collection.query(
query_embeddings=[query_embedding],
n_results=top_k
)
# Format results
retrieved_docs = []
if results['documents'] and results['documents'][0]:
for i, doc in enumerate(results['documents'][0]):
retrieved_docs.append({
'text': doc,
'score': 1 - results['distances'][0][i],
'metadata': results['metadatas'][0][i] if results['metadatas'] else {}
})
return retrieved_docs
# Initialize and populate vector database
vector_db = VectorDatabase()
vector_db.add_documents(processed_chunks)
# ============================================================================
# MODEL FINE-TUNING
# ============================================================================
class ModelFineTuner:
"""
Fine-tunes GPT-2 model using LoRA (Low-Rank Adaptation).
LoRA reduces trainable parameters from 124M to approximately 1M, enabling
efficient fine-tuning on limited hardware while maintaining performance.
"""
def __init__(self, config: SystemConfig):
self.config = config
self.tokenizer = None
self.model = None
self.trainer = None
def load_base_model(self):
"""
Load base GPT-2 model and tokenizer.
Configures padding tokens and prepares model for training.
"""
logger.info(f"Loading base model: {self.config.base_model_name}")
self.tokenizer = AutoTokenizer.from_pretrained(self.config.base_model_name)
# Set pad token to EOS token for proper padding
if self.tokenizer.pad_token is None:
self.tokenizer.pad_token = self.tokenizer.eos_token
# Load model with appropriate precision
self.model = AutoModelForCausalLM.from_pretrained(
self.config.base_model_name,
torch_dtype=torch.float32
)
# Move to device if GPU available
if torch.cuda.is_available():
self.model = self.model.to(device)
self.model.config.pad_token_id = self.tokenizer.pad_token_id
logger.info(f"Model loaded: {sum(p.numel() for p in self.model.parameters()):,} parameters")
def setup_lora(self):
"""
Configure LoRA for parameter-efficient fine-tuning.
LoRA adds trainable low-rank matrices to attention layers while freezing
the majority of model weights, reducing memory and compute requirements.
"""
logger.info("Setting up LoRA configuration...")
lora_config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
r=self.config.lora_r,
lora_alpha=self.config.lora_alpha,
lora_dropout=self.config.lora_dropout,
target_modules=self.config.lora_target_modules,
bias="none"
)
self.model = get_peft_model(self.model, lora_config)
trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in self.model.parameters())
logger.info(f"LoRA configured:")
logger.info(f" Trainable parameters: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")
logger.info(f" Total parameters: {total_params:,}")
def prepare_dataset(self, texts: List[str]) -> Dataset:
"""
Tokenize and prepare dataset for training.
Splits data into train and evaluation sets for monitoring overfitting.
"""
logger.info("Preparing training dataset...")
def tokenize_function(examples):
return self.tokenizer(
examples['text'],
truncation=True,
max_length=self.config.max_input_length,
padding='max_length'
)
# Create dataset from text samples
dataset_dict = {'text': texts}
dataset = Dataset.from_dict(dataset_dict)
# Tokenize all samples
tokenized_dataset = dataset.map(
tokenize_function,
batched=True,
remove_columns=dataset.column_names,
desc="Tokenizing"
)
# Split into train and evaluation sets
split_dataset = tokenized_dataset.train_test_split(
test_size=0.1,
seed=self.config.random_seed
)
logger.info(f"Dataset prepared: {len(split_dataset['train'])} train, {len(split_dataset['test'])} eval")
return split_dataset
def train(self, training_texts: List[str]):
"""
Fine-tune the model using LoRA.
Trains on question-answer pairs to improve Python documentation responses.
"""
logger.info("Starting fine-tuning...")
# Prepare dataset
dataset = self.prepare_dataset(training_texts)
# Training arguments configured for stability and efficiency
training_args = TrainingArguments(
output_dir=self.config.model_save_path,
num_train_epochs=self.config.num_train_epochs,
per_device_train_batch_size=self.config.per_device_train_batch_size,
per_device_eval_batch_size=self.config.per_device_train_batch_size,
gradient_accumulation_steps=self.config.gradient_accumulation_steps,
learning_rate=self.config.learning_rate,
warmup_steps=self.config.warmup_steps,
max_steps=self.config.max_steps,
logging_steps=self.config.logging_steps,
save_steps=self.config.save_steps,
eval_steps=self.config.eval_steps,
eval_strategy="steps",
save_strategy="steps",
load_best_model_at_end=True,
metric_for_best_model="loss",
fp16=False,
report_to="none",
seed=self.config.random_seed,
data_seed=self.config.random_seed,
max_grad_norm=1.0,
)
# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
tokenizer=self.tokenizer,
mlm=False
)
# Initialize trainer
self.trainer = Trainer(
model=self.model,
args=training_args,
train_dataset=dataset['train'],
eval_dataset=dataset['test'],
data_collator=data_collator,
)
# Train the model
logger.info("Training started...")
train_result = self.trainer.train()
logger.info("Training completed!")
logger.info(f"Training loss: {train_result.training_loss:.4f}")
# Save fine-tuned model and tokenizer
self.trainer.save_model()
self.tokenizer.save_pretrained(self.config.model_save_path)
logger.info(f"Model saved to {self.config.model_save_path}")
def load_finetuned_model(self):
"""
Load the fine-tuned model with proper error handling.
Handles both full models and LoRA checkpoints.
"""
if not os.path.exists(self.config.model_save_path):
return False
try:
logger.info(f"Loading fine-tuned model from {self.config.model_save_path}")
# Check if this is a LoRA checkpoint
adapter_config_path = os.path.join(self.config.model_save_path, 'adapter_config.json')
if os.path.exists(adapter_config_path):
# Load base model first
logger.info("Loading base model for LoRA adapter...")
self.tokenizer = AutoTokenizer.from_pretrained(self.config.base_model_name)
if self.tokenizer.pad_token is None:
self.tokenizer.pad_token = self.tokenizer.eos_token
base_model = AutoModelForCausalLM.from_pretrained(
self.config.base_model_name,
torch_dtype=torch.float32
)
# Load LoRA adapter
logger.info("Loading LoRA adapter...")
from peft import PeftModel
self.model = PeftModel.from_pretrained(base_model, self.config.model_save_path)
else:
# Load full fine-tuned model
self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_save_path)
self.model = AutoModelForCausalLM.from_pretrained(
self.config.model_save_path,
torch_dtype=torch.float32
)
# Move to device if GPU available
if torch.cuda.is_available():
self.model = self.model.to(device)
self.model.config.pad_token_id = self.tokenizer.pad_token_id
logger.info("Fine-tuned model loaded successfully")
return True
except Exception as e:
logger.error(f"Failed to load fine-tuned model: {str(e)}")
logger.info("Will retrain the model")
return False
# Fine-tune the model
fine_tuner = ModelFineTuner(config)
# Check if model already exists and is valid
model_loaded = fine_tuner.load_finetuned_model()
if not model_loaded:
logger.info("Starting model fine-tuning process...")
fine_tuner.load_base_model()
fine_tuner.setup_lora()
fine_tuner.train(training_texts)
logger.info("Model ready for inference")
# ============================================================================
# RAG SYSTEM
# ============================================================================
class RAGSystem:
"""
Complete RAG (Retrieval-Augmented Generation) system.
Combines vector retrieval with fine-tuned language model to provide
accurate, grounded responses to Python documentation queries.
"""
def __init__(self, model, tokenizer, vector_db: VectorDatabase, config: SystemConfig):
self.model = model
self.tokenizer = tokenizer
self.vector_db = vector_db
self.config = config
# Statistics tracking for performance monitoring
self.query_count = 0
self.total_latency = 0.0
self.retrieval_stats = []
def retrieve_context(self, query: str) -> Tuple[str, List[Dict]]:
"""
Retrieve relevant context from vector database using semantic search.
Filters results by minimum relevance score to ensure quality.
Returns:
Tuple of formatted context string and list of retrieved documents
"""
retrieved_docs = self.vector_db.search(query, top_k=self.config.retrieval_top_k)
# Filter by minimum relevance score
relevant_docs = [
doc for doc in retrieved_docs
if doc['score'] >= self.config.min_relevance_score
]
if not relevant_docs:
return "", []
# Format context for model input
context_parts = []
for i, doc in enumerate(relevant_docs, 1):
context_parts.append(f"[Source {i}] {doc['text']}")
formatted_context = "\n\n".join(context_parts)
return formatted_context, relevant_docs
def generate_answer(self, query: str, context: str) -> str:
"""
Generate answer using fine-tuned model with retrieved context.
The model is prompted to answer based on the retrieved documentation,
producing concise and accurate responses.
"""
# Construct prompt with context and query
if context:
prompt = f"""Using the information below, provide a clear and concise answer to the question.
{context}
Question: {query}
Answer:"""
else:
prompt = f"""Question: {query}
Answer:"""
# Tokenize input
inputs = self.tokenizer(
prompt,
return_tensors="pt",
truncation=True,
max_length=self.config.max_input_length
)
if torch.cuda.is_available():
inputs = {k: v.to(device) for k, v in inputs.items()}
# Generate response
with torch.no_grad():
outputs = self.model.generate(
**inputs,
max_new_tokens=self.config.max_new_tokens,
temperature=self.config.temperature,
top_p=self.config.top_p,
top_k=self.config.top_k,
repetition_penalty=self.config.repetition_penalty,
do_sample=True,
pad_token_id=self.tokenizer.pad_token_id,
eos_token_id=self.tokenizer.eos_token_id
)
# Decode generated text
generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
# Extract only the answer part after "Answer:"
if "Answer:" in generated_text:
answer = generated_text.split("Answer:")[-1].strip()
else:
answer = generated_text.strip()
# Clean up answer
answer = answer.split('\n\n')[0]
answer = answer.split('Question:')[0]
answer = answer.strip()
return answer
def answer_query(self, query: str) -> Dict[str, Any]:
"""
Complete RAG pipeline: retrieve relevant documents and generate answer.
Tracks performance metrics for each query.
Returns:
Dictionary with answer, sources, metrics, and metadata
"""
start_time = time.time()
try:
# Input validation
if not query or len(query.strip()) == 0:
return {
'success': False,
'error': 'Query cannot be empty',
'answer': '',
'sources': [],
'latency_ms': 0
}
if len(query) > 500:
return {
'success': False,
'error': 'Query too long (max 500 characters)',
'answer': '',
'sources': [],
'latency_ms': 0
}
# Retrieve context
retrieval_start = time.time()
context, retrieved_docs = self.retrieve_context(query)
retrieval_time = (time.time() - retrieval_start) * 1000
# Generate answer
generation_start = time.time()
answer = self.generate_answer(query, context)
generation_time = (time.time() - generation_start) * 1000
# Calculate total latency
total_latency = (time.time() - start_time) * 1000
# Update statistics
self.query_count += 1
self.total_latency += total_latency
self.retrieval_stats.append({
'num_retrieved': len(retrieved_docs),
'avg_score': np.mean([d['score'] for d in retrieved_docs]) if retrieved_docs else 0
})
# Format sources
sources = []
for doc in retrieved_docs:
sources.append({
'title': doc['metadata'].get('title', 'Unknown'),
'url': doc['metadata'].get('url', ''),
'relevance_score': round(doc['score'], 3)
})
return {
'success': True,
'answer': answer,
'sources': sources,
'latency_ms': round(total_latency, 1),
'retrieval_time_ms': round(retrieval_time, 1),
'generation_time_ms': round(generation_time, 1),
'num_sources': len(retrieved_docs),
'query_count': self.query_count
}
except Exception as e:
logger.error(f"Error processing query: {str(e)}")
logger.error(traceback.format_exc())
return {
'success': False,
'error': f'Internal error: {str(e)}',
'answer': '',
'sources': [],
'latency_ms': (time.time() - start_time) * 1000
}
def get_statistics(self) -> Dict[str, Any]:
"""Get system performance statistics for monitoring."""
avg_latency = self.total_latency / self.query_count if self.query_count > 0 else 0
avg_sources = np.mean([s['num_retrieved'] for s in self.retrieval_stats]) if self.retrieval_stats else 0
avg_relevance = np.mean([s['avg_score'] for s in self.retrieval_stats]) if self.retrieval_stats else 0
return {
'total_queries': self.query_count,
'avg_latency_ms': round(avg_latency, 1),
'avg_sources_retrieved': round(avg_sources, 1),
'avg_relevance_score': round(avg_relevance, 3)
}
# Initialize RAG system
rag_system = RAGSystem(
model=fine_tuner.model,
tokenizer=fine_tuner.tokenizer,
vector_db=vector_db,
config=config
)
logger.info("RAG system initialized successfully")
# ============================================================================
# EVALUATION FRAMEWORK
# ============================================================================
class EvaluationFramework:
"""
Comprehensive evaluation of RAG system.
Measures retrieval quality, generation quality, and overall performance
using standard metrics like ROUGE and BERTScore.
"""
def __init__(self, rag_system: RAGSystem):
self.rag_system = rag_system
self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
def create_eval_dataset(self, chunks: List[Dict], num_samples: int = 50) -> List[Dict]:
"""
Create evaluation dataset from documentation chunks.
Generates questions and reference answers for quantitative evaluation.
"""
logger.info(f"Creating evaluation dataset with {num_samples} samples...")
eval_samples = []
# Sample random chunks for diverse evaluation
sampled_chunks = random.sample(chunks, min(num_samples, len(chunks)))
for chunk in sampled_chunks:
text = chunk['text']
# Extract meaningful sentences as ground truth
sentences = [s.strip() for s in text.split('.') if len(s.strip()) > 20]
if not sentences:
continue
# Create questions based on module and title
questions = [
f"What is {chunk['module']}?",
f"How does {chunk['module']} work?",
f"Explain {chunk['title']}",
]
question = random.choice(questions)
# Use first few sentences as reference answer
reference_answer = '. '.join(sentences[:3]) + '.'
eval_samples.append({
'question': question,
'reference_answer': reference_answer,
'context': text,
'module': chunk['module']
})
logger.info(f"Created {len(eval_samples)} evaluation samples")
return eval_samples
def evaluate_retrieval(self, eval_dataset: List[Dict]) -> Dict[str, float]:
"""
Evaluate retrieval quality.
Measures whether the correct documents are retrieved for queries.
"""
logger.info("Evaluating retrieval quality...")
retrieval_scores = []
for sample in tqdm(eval_dataset, desc="Evaluating retrieval"):
query = sample['question']
expected_module = sample['module']
# Retrieve documents
retrieved_docs = self.rag_system.vector_db.search(query, top_k=3)
# Check if correct module is retrieved
retrieved_modules = [doc['metadata'].get('module', '') for doc in retrieved_docs]
# Score: 1 if correct module in top results, 0 otherwise
score = 1.0 if expected_module in retrieved_modules else 0.0
retrieval_scores.append(score)
avg_retrieval_score = np.mean(retrieval_scores)
return {
'retrieval_accuracy': round(avg_retrieval_score, 3),
'samples_evaluated': len(retrieval_scores)
}
def evaluate_generation(self, eval_dataset: List[Dict]) -> Dict[str, float]:
"""
Evaluate generation quality using ROUGE and BERTScore metrics.
ROUGE measures lexical overlap while BERTScore measures semantic similarity.
"""
logger.info("Evaluating generation quality...")
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []
bert_scores_f1 = []
generated_answers = []
reference_answers = []
for sample in tqdm(eval_dataset[:20], desc="Evaluating generation"):
query = sample['question']
reference = sample['reference_answer']
# Generate answer
result = self.rag_system.answer_query(query)
if result['success']:
generated = result['answer']
# Calculate ROUGE scores for lexical overlap
rouge_scores = self.rouge_scorer.score(reference, generated)
rouge1_scores.append(rouge_scores['rouge1'].fmeasure)
rouge2_scores.append(rouge_scores['rouge2'].fmeasure)
rougeL_scores.append(rouge_scores['rougeL'].fmeasure)
# Store for BERTScore calculation
generated_answers.append(generated)
reference_answers.append(reference)
# Calculate BERTScore if available
if BERTSCORE_AVAILABLE and generated_answers:
try:
P, R, F1 = bert_score(generated_answers, reference_answers, lang='en', verbose=False)
bert_scores_f1 = F1.tolist()
except Exception as e:
logger.warning(f"BERTScore calculation failed: {e}")
bert_scores_f1 = []
return {
'rouge1_f1': round(np.mean(rouge1_scores), 3) if rouge1_scores else 0.0,
'rouge2_f1': round(np.mean(rouge2_scores), 3) if rouge2_scores else 0.0,
'rougeL_f1': round(np.mean(rougeL_scores), 3) if rougeL_scores else 0.0,
'bertscore_f1': round(np.mean(bert_scores_f1), 3) if bert_scores_f1 else 0.0,
'samples_evaluated': len(rouge1_scores)
}
def run_full_evaluation(self) -> Dict[str, Any]:
"""Run complete evaluation suite and return comprehensive metrics."""
logger.info("=" * 70)
logger.info("Starting comprehensive evaluation")
logger.info("=" * 70)
# Create eval dataset
eval_dataset = self.create_eval_dataset(processed_chunks, num_samples=config.eval_sample_size)
# Evaluate retrieval
retrieval_metrics = self.evaluate_retrieval(eval_dataset)
# Evaluate generation
generation_metrics = self.evaluate_generation(eval_dataset)
# System stats
system_stats = self.rag_system.get_statistics()
results = {
'retrieval_metrics': retrieval_metrics,
'generation_metrics': generation_metrics,
'system_statistics': system_stats,
'evaluation_timestamp': datetime.now().isoformat()
}
logger.info("=" * 70)
logger.info("Evaluation Results:")
logger.info(f" Retrieval Accuracy: {retrieval_metrics['retrieval_accuracy']:.3f}")
logger.info(f" ROUGE-L F1: {generation_metrics['rougeL_f1']:.3f}")
if generation_metrics['bertscore_f1'] > 0:
logger.info(f" BERTScore F1: {generation_metrics['bertscore_f1']:.3f}")
logger.info("=" * 70)
return results
# Run evaluation
evaluator = EvaluationFramework(rag_system)
evaluation_results = evaluator.run_full_evaluation()
# Save evaluation results
eval_results_path = "./evaluation_results.json"
with open(eval_results_path, 'w') as f:
json.dump(evaluation_results, f, indent=2)
logger.info(f"Evaluation results saved to {eval_results_path}")
# ============================================================================
# GRADIO INTERFACE
# ============================================================================
def create_gradio_interface():
"""
Create Gradio interface matching the MLOps project style.
Compact layout with left-aligned text and no large empty spaces.
"""
def process_query(query: str) -> Tuple[str, str]:
"""Process user query and return formatted results."""
if not query or len(query.strip()) == 0:
return "Please enter a question.", ""
# Process query through RAG system
result = rag_system.answer_query(query)
if not result['success']:
error_msg = result.get('error', 'Unknown error occurred')
return f"Error: {error_msg}", ""
# Format answer
answer_text = f"**Answer:** {result['answer']}\n\n"
answer_text += f"**Model Version:** {config.model_save_path}\n"
answer_text += f"**Inference Latency:** {result['latency_ms']:.1f}ms\n"
# Format sources and metrics
metrics_text = f"**Performance Metrics:**\n"
metrics_text += f"- Total Latency: {result['latency_ms']:.1f}ms\n"
metrics_text += f"- Retrieval Time: {result['retrieval_time_ms']:.1f}ms\n"
metrics_text += f"- Generation Time: {result['generation_time_ms']:.1f}ms\n"
metrics_text += f"- Sources Retrieved: {result['num_sources']}\n"
metrics_text += f"- Total Queries Processed: {result['query_count']}\n\n"
if result['sources']:
metrics_text += "**Retrieved Sources:**\n"
for i, source in enumerate(result['sources'], 1):
metrics_text += f"{i}. {source['title']} (Relevance: {source['relevance_score']:.2%})\n"
metrics_text += f" URL: {source['url']}\n"
else:
metrics_text += "No relevant sources found. Answer may be less accurate.\n"
return answer_text, metrics_text
def show_evaluation_results() -> str:
"""Display evaluation results."""
if not evaluation_results:
return "No evaluation results available."
results_text = "**Model Evaluation Results**\n\n"
results_text += "**Retrieval Performance:**\n"
results_text += f"- Retrieval Accuracy: {evaluation_results['retrieval_metrics']['retrieval_accuracy']:.1%}\n"
results_text += f"- Samples Evaluated: {evaluation_results['retrieval_metrics']['samples_evaluated']}\n\n"
results_text += "**Generation Quality:**\n"
results_text += f"- ROUGE-1 F1: {evaluation_results['generation_metrics']['rouge1_f1']:.3f}\n"
results_text += f"- ROUGE-2 F1: {evaluation_results['generation_metrics']['rouge2_f1']:.3f}\n"
results_text += f"- ROUGE-L F1: {evaluation_results['generation_metrics']['rougeL_f1']:.3f}\n"
if evaluation_results['generation_metrics']['bertscore_f1'] > 0:
results_text += f"- BERTScore F1: {evaluation_results['generation_metrics']['bertscore_f1']:.3f}\n"
results_text += f"\n**System Statistics:**\n"
results_text += f"- Total Queries: {evaluation_results['system_statistics']['total_queries']}\n"
results_text += f"- Average Latency: {evaluation_results['system_statistics']['avg_latency_ms']:.1f}ms\n"
results_text += f"- Avg Sources Retrieved: {evaluation_results['system_statistics']['avg_sources_retrieved']:.1f}\n\n"
results_text += f"**Evaluation Date:** {evaluation_results['evaluation_timestamp']}\n\n"
results_text += "**Interpretation:**\n"
results_text += "- ROUGE scores measure overlap with reference answers (0-1, higher is better)\n"
results_text += "- BERTScore measures semantic similarity (0-1, higher is better)\n"
results_text += "- Retrieval accuracy shows percentage of queries where relevant docs were retrieved\n"
return results_text
def show_system_info() -> str:
"""Display system information."""
info_text = "**System Configuration**\n\n"
info_text += "**Model Details:**\n"
info_text += f"- Base Model: {config.base_model_name}\n"
info_text += f"- Fine-tuning: LoRA (Low-Rank Adaptation)\n"
info_text += f"- LoRA Rank: {config.lora_r}\n"
info_text += f"- Training Steps: {config.max_steps}\n"
info_text += f"- Random Seed: {config.random_seed} (for reproducibility)\n\n"
info_text += "**Embedding Model:**\n"
info_text += f"- Model: {config.embedding_model_name}\n"
info_text += f"- Vector Database: ChromaDB\n\n"
info_text += "**Data Source:**\n"
info_text += "- Python 3 Official Documentation\n"
info_text += "- License: PSF License (GPL-compatible)\n"
info_text += "- Source: https://docs.python.org/3/\n"
info_text += f"- Documents Collected: {len(raw_documents)}\n"
info_text += f"- Total Chunks: {len(processed_chunks)}\n\n"
info_text += "**RAG Configuration:**\n"
info_text += f"- Chunk Size: {config.chunk_size} characters\n"
info_text += f"- Chunk Overlap: {config.chunk_overlap} characters\n"
info_text += f"- Retrieval Top-K: {config.retrieval_top_k}\n"
info_text += f"- Min Relevance Score: {config.min_relevance_score}\n\n"
info_text += "**Generation Parameters:**\n"
info_text += f"- Max New Tokens: {config.max_new_tokens}\n"
info_text += f"- Temperature: {config.temperature}\n"
info_text += f"- Top-P: {config.top_p}\n"
info_text += f"- Repetition Penalty: {config.repetition_penalty}\n\n"
info_text += "**Hardware:**\n"
info_text += f"- Device: {device}\n"
info_text += f"- GPU Available: {torch.cuda.is_available()}\n"
if torch.cuda.is_available():
info_text += f"- GPU: {torch.cuda.get_device_name(0)}\n"
return info_text
# Create interface with compact styling
with gr.Blocks(title="Fine-Tuned RAG Framework - Python Documentation Q&A", theme=gr.themes.Soft()) as interface:
gr.Markdown("""
# Fine-Tuned RAG Framework
## Python Documentation Question Answering System
**Author:** Spencer Purdy
**Dataset:** Python 3 Official Documentation
**Model:** GPT-2 with LoRA fine-tuning
This system demonstrates ML engineering skills including data collection, preprocessing,
model fine-tuning, RAG implementation, and comprehensive evaluation.
""")
with gr.Tabs():
with gr.Tab("Ask Questions"):
gr.Markdown("""
### Query Python Documentation
Enter your question about Python's standard library to get an AI-generated answer
based on official documentation.
""")
with gr.Row():
with gr.Column(scale=2):
query_input = gr.Textbox(
label="Question",
placeholder="Example: What is the datetime module used for?",
lines=2
)
query_button = gr.Button("Get Answer", variant="primary")
answer_output = gr.Markdown(label="Answer")
with gr.Column(scale=1):
metrics_output = gr.Markdown(label="Details")
gr.Markdown("### Example Questions")
gr.Examples(
examples=[
["What is the datetime module used for?"],
["How do I read and write JSON files in Python?"],
["Explain list comprehensions in Python"],
["What are the main features of the collections module?"],
["How do I use regular expressions in Python?"],
["What is the difference between os and pathlib?"],
],
inputs=query_input
)
query_button.click(
fn=process_query,
inputs=[query_input],
outputs=[answer_output, metrics_output]
)
query_input.submit(
fn=process_query,
inputs=[query_input],
outputs=[answer_output, metrics_output]
)
gr.Markdown("""
**Important Limitations:**
- Limited to Python 3 standard library documentation
- May not have info on latest Python versions
- Always verify critical information with official docs
- Best for conceptual questions, not version-specific details
""")
with gr.Tab("Model Evaluation"):
gr.Markdown("""
### Comprehensive Model Evaluation
This system has been evaluated using multiple metrics to assess both retrieval
and generation quality.
""")
eval_display = gr.Markdown(value=show_evaluation_results())
gr.Markdown("""
### Known Limitations and Failure Cases
**Retrieval Failures:**
- May not retrieve relevant documents for very specific or niche topics
- Struggles with questions requiring information from multiple disparate sources
- Version-specific questions may return generic information
**Generation Failures:**
- May generate plausible-sounding but incorrect information (hallucination)
- Can be verbose or include irrelevant details
- Sometimes ignores retrieved context in favor of pre-trained knowledge
- May truncate answers due to token limits
**Input Limitations:**
- Maximum query length: 500 characters
- Best performance on clear, focused questions
- Ambiguous questions may produce generic answers
**Data Limitations:**
- Limited to Python standard library (no third-party packages like numpy, pandas)
- Documentation snapshot may be outdated for latest Python versions
- Some modules may have limited coverage
**Always verify critical information with official Python documentation.**
""")
with gr.Tab("System Information"):
gr.Markdown("""
### Technical Details
Complete information about the system architecture, data sources, and configuration.
""")
system_info_display = gr.Markdown(value=show_system_info())
gr.Markdown("""
### Data Attribution and Licensing
**Data Source:**
- Python 3 Official Documentation
- URL: https://docs.python.org/3/
- License: Python Software Foundation License (PSF License)
- The PSF License is GPL-compatible and permits redistribution and modification
**Models Used:**
- GPT-2: OpenAI (MIT License)
- Sentence-Transformers: Apache 2.0 License
**Dependencies:**
- All dependencies are open-source with permissive licenses
### Reproducibility
This system is designed for full reproducibility:
- All random seeds are set (42)
- All hyperparameters are documented
- Training process is deterministic
- Evaluation metrics are computed consistently
To reproduce results:
1. Use the same random seed
2. Use the same model versions
3. Use the same data source
4. Follow the same training procedure
""")
gr.Markdown("""
---
**Fine-Tuned RAG Framework v1.0.0** | Built with Gradio | Author: Spencer Purdy
System demonstrates: Data preprocessing, Feature engineering, Model fine-tuning,
RAG implementation, Comprehensive evaluation, Production monitoring
**Disclaimer:** This system is for educational and demonstrational purposes. Always verify
important information with official Python documentation at https://docs.python.org/3/
""")
return interface
# ============================================================================
# MAIN EXECUTION
# ============================================================================
logger.info("=" * 70)
logger.info("Creating Gradio interface...")
logger.info("=" * 70)
interface = create_gradio_interface()
logger.info("Launching application...")
logger.info("=" * 70)
logger.info("System ready!")
logger.info("Access the interface through the URL below")
logger.info("=" * 70)
# Launch interface with sharing enabled
interface.launch(
share=True,
server_name="0.0.0.0",
server_port=7860,
show_error=True,
quiet=False
)
# ============================================================================
# SYSTEM SUMMARY
# ============================================================================
print("""
================================================================================
FINE-TUNED RAG FRAMEWORK - SETUP COMPLETE
================================================================================
SYSTEM OVERVIEW:
- Fine-tuned GPT-2 model (124M parameters) with LoRA
- {0} Python documentation documents collected
- {1} document chunks in vector database
- {2} training samples generated
- Model evaluation completed
KEY METRICS:
- Retrieval Accuracy: {3:.1%}
- ROUGE-L F1 Score: {4:.3f}
- BERTScore F1: {5:.3f}
- Average Query Latency: {6:.1f}ms
IMPROVEMENTS IN THIS VERSION:
- Expanded documentation collection to {0} documents (from 32)
- Increased to {1} chunks for better coverage
- Lowered relevance threshold to {7} (from 0.2)
- Added tutorial and reference pages for conceptual topics
- Enhanced training data with {2} samples
USAGE EXAMPLES:
1. Ask about Python modules:
"What is the datetime module?"
"How do I use the json module?"
2. Ask about Python concepts:
"Explain list comprehensions"
"What are decorators?"
3. Ask for code guidance:
"How do I read files in Python?"
"How to handle exceptions?"
LIMITATIONS:
- Only covers Python standard library
- Best for Python 3.x (may have gaps for latest versions)
- Always verify critical information with official docs
- Not suitable for production use without further validation
DATA ATTRIBUTION:
- Source: Python 3 Official Documentation (docs.python.org)
- License: PSF License (GPL-compatible)
- All data collection respects robots.txt and rate limits
For more information, see the system documentation in the interface.
================================================================================
""".format(
len(raw_documents),
len(processed_chunks),
len(training_texts),
evaluation_results['retrieval_metrics']['retrieval_accuracy'],
evaluation_results['generation_metrics']['rougeL_f1'],
evaluation_results['generation_metrics']['bertscore_f1'],
evaluation_results['system_statistics']['avg_latency_ms'],
config.min_relevance_score
))
# ============================================================================
# SAVE SYSTEM STATE
# ============================================================================
system_state = {
'config': asdict(config),
'evaluation_results': evaluation_results,
'num_documents': len(raw_documents),
'num_chunks': len(processed_chunks),
'num_training_samples': len(training_texts),
'model_path': config.model_save_path,
'vector_db_path': config.vector_db_path,
'creation_timestamp': datetime.now().isoformat(),
'random_seed': config.random_seed
}
system_state_path = "./system_state.json"
with open(system_state_path, 'w') as f:
json.dump(system_state, f, indent=2)
logger.info(f"System state saved to {system_state_path}")
logger.info("Application is now running. Use Ctrl+C to stop.")