Text Generation
Transformers
English
qwen2
code-generation
python
fine-tuning
Qwen
tools
agent-framework
multi-agent
conversational
Eval Results (legacy)
Instructions to use my-ai-stack/Stack-2-9-finetuned with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use my-ai-stack/Stack-2-9-finetuned with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="my-ai-stack/Stack-2-9-finetuned")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("my-ai-stack/Stack-2-9-finetuned")
model = AutoModelForCausalLM.from_pretrained("my-ai-stack/Stack-2-9-finetuned")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use my-ai-stack/Stack-2-9-finetuned with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "my-ai-stack/Stack-2-9-finetuned"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "my-ai-stack/Stack-2-9-finetuned",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker
docker model run hf.co/my-ai-stack/Stack-2-9-finetuned
- SGLang
How to use my-ai-stack/Stack-2-9-finetuned with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "my-ai-stack/Stack-2-9-finetuned" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "my-ai-stack/Stack-2-9-finetuned",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "my-ai-stack/Stack-2-9-finetuned" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "my-ai-stack/Stack-2-9-finetuned",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
- Docker Model Runner
How to use my-ai-stack/Stack-2-9-finetuned with Docker Model Runner:
docker model run hf.co/my-ai-stack/Stack-2-9-finetuned
| """ | |
| Conversation quality evaluation for Stack 2.9 | |
| Measures context retention, multi-turn coherence, error recovery, and user satisfaction | |
| """ | |
| import json | |
| from typing import Dict, List, Any, Tuple | |
| from datetime import datetime, timedelta | |
| import random | |
class ConversationQualityEvaluator:
    """Keyword-heuristic scorer for recorded multi-turn conversations.

    Conversations are loaded from a JSON file whose root is a list of
    objects. Each object may carry an ``"id"`` and a ``"messages"`` list;
    each message may carry ``"role"``, ``"content"`` and ``"timestamp"``.
    All four metrics are plain substring/word-overlap heuristics, not
    model-based judgements.
    """

    # Phrases treated as an explicit pointer back to earlier turns.
    _REFERENCE_PHRASES = ("as mentioned", "earlier", "before")

    # Phrases treated as a correction. Variants such as "correction:" or
    # "correction to" are already covered by the bare word "correction".
    _CORRECTION_PHRASES = (
        "correction", "actually", "sorry", "i meant", "meant to say",
    )

    # Substrings counted as positive sentiment in user messages.
    _POSITIVE_WORDS = (
        "thanks", "thank you", "great", "good", "excellent",
        "perfect", "awesome", "wonderful", "love", "amazing",
    )

    def __init__(self, conversation_history_path: str = "conversations.json"):
        """Load the conversation history and start with empty results.

        Args:
            conversation_history_path: Path of the JSON history file.
        """
        self.conversation_history_path = conversation_history_path
        self.conversations = self._load_conversations()
        # Per-conversation scores, keyed by conversation id; rebuilt on
        # every call to evaluate_conversations().
        self.results: Dict[Any, Dict[str, Any]] = {}

    @staticmethod
    def _get_messages(conversation: Dict) -> List[Dict]:
        """Return the conversation's message list, or [] if absent/not a list."""
        messages = conversation.get("messages")
        return messages if isinstance(messages, list) else []

    @staticmethod
    def _get_text(message: Dict) -> str:
        """Return the message content lower-cased; missing/None becomes ""."""
        content = message.get("content")
        if content is None:
            return ""
        return str(content).lower()

    def _load_conversations(self) -> List[Dict]:
        """Read the history file and return its conversations.

        Returns:
            The list of conversation dicts, or an empty list when the file
            is missing, is not valid JSON, or its root is not a list.
        """
        try:
            with open(self.conversation_history_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            print(f"Conversation history not found at {self.conversation_history_path}")
            return []
        except json.JSONDecodeError:
            print("Error parsing conversation history")
            return []
        if not isinstance(data, list):
            # A dict root would otherwise be iterated key by key and crash
            # later inside the scorers.
            print("Error parsing conversation history")
            return []
        return [entry for entry in data if isinstance(entry, dict)]

    def evaluate_conversations(self) -> Dict[str, Any]:
        """Score every loaded conversation and aggregate the averages.

        Returns:
            ``{}`` when there is nothing to evaluate, otherwise a dict with
            a ``"summary"`` of averages and the per-conversation
            ``"detailed_results"``.
        """
        print("Evaluating conversation quality...")
        # Start from a clean slate so repeated calls (generate_report calls
        # this method again) never accumulate stale or duplicate entries.
        self.results = {}
        if not self.conversations:
            print("No conversations found for evaluation")
            return {}

        total_conversations = len(self.conversations)
        print(f"Evaluating {total_conversations} conversations")

        metric_names = ("context_retention", "coherence", "error_recovery", "satisfaction")
        collected: Dict[str, List[float]] = {name: [] for name in metric_names}
        for position, conversation in enumerate(self.conversations, start=1):
            print(f"Evaluating conversation {position}/{total_conversations}...")
            scores = self._evaluate_single_conversation(
                conversation, fallback_id=f"#{position}"
            )
            for name in metric_names:
                collected[name].append(scores[name])

        return {
            "summary": {
                "total_conversations": total_conversations,
                "average_context_retention": self._calculate_average(collected["context_retention"]),
                "average_coherence": self._calculate_average(collected["coherence"]),
                "average_error_recovery": self._calculate_average(collected["error_recovery"]),
                "average_satisfaction": self._calculate_average(collected["satisfaction"]),
            },
            "detailed_results": self.results,
        }

    def _evaluate_single_conversation(
        self, conversation: Dict, fallback_id: str = ""
    ) -> Dict[str, float]:
        """Score one conversation and record the outcome in ``self.results``.

        Args:
            conversation: The conversation dict to score.
            fallback_id: Key to use when the conversation has no ``"id"``.
                When empty, a random four-digit id is generated instead.

        Returns:
            The four metric scores for this conversation.
        """
        default_id = fallback_id or str(random.randint(1000, 9999))
        conversation_id = conversation.get("id", default_id)

        scores = {
            "context_retention": self._measure_context_retention(conversation),
            "coherence": self._measure_coherence(conversation),
            "error_recovery": self._measure_error_recovery(conversation),
            # Proxy metric: keyword sentiment plus conversation length.
            "satisfaction": self._measure_satisfaction(conversation),
        }
        self.results[conversation_id] = {
            **scores,
            "message_count": len(self._get_messages(conversation)),
            "duration_minutes": self._calculate_conversation_duration(conversation),
        }
        return scores

    def _measure_context_retention(self, conversation: Dict) -> float:
        """Return the share of messages (from the third on) that refer back.

        Conversations with fewer than three messages score 1.0 because
        there is not enough context to evaluate.
        """
        messages = self._get_messages(conversation)
        if len(messages) < 3:
            return 1.0
        referencing = sum(
            1
            for index in range(2, len(messages))
            if self._contains_reference(messages[index], messages[:index])
        )
        return referencing / (len(messages) - 2)

    def _contains_reference(self, message: Dict, earlier_messages: List[Dict]) -> bool:
        """Check whether ``message`` refers back to earlier messages.

        A message counts as a reference when it contains an explicit
        back-reference phrase, or when it contains the first 20 characters
        or the first word of any of the last three earlier messages.
        """
        content = self._get_text(message)
        if any(phrase in content for phrase in self._REFERENCE_PHRASES):
            return True

        for earlier in earlier_messages[-3:]:
            earlier_content = self._get_text(earlier)
            words = earlier_content.split()
            if not words:
                # Empty earlier content has no first word to index, and an
                # empty prefix would match every message.
                continue
            if earlier_content[:20] in content or words[0] in content:
                return True
        return False

    def _measure_coherence(self, conversation: Dict) -> float:
        """Return the share of adjacent message pairs that stay on topic.

        Conversations with fewer than two messages score 1.0.
        """
        messages = self._get_messages(conversation)
        if len(messages) < 2:
            return 1.0
        coherence_breaks = sum(
            1
            for previous, current in zip(messages, messages[1:])
            if not self._is_coherent(previous, current)
        )
        return 1.0 - (coherence_breaks / (len(messages) - 1))

    def _is_coherent(self, message1: Dict, message2: Dict) -> bool:
        """Return True when both messages share a word longer than 3 characters."""
        words1 = set(self._get_text(message1).split())
        words2 = set(self._get_text(message2).split())
        return any(len(word) > 3 for word in words1 & words2)

    def _measure_error_recovery(self, conversation: Dict) -> float:
        """Return the share of adjacent pairs whose second message is a correction.

        Conversations with fewer than three messages score 1.0.

        NOTE(review): a conversation in which nothing needed correcting
        scores 0.0 here, because the ratio is taken over all message pairs
        and not over detected errors. Confirm this is the intended metric.
        """
        messages = self._get_messages(conversation)
        if len(messages) < 3:
            return 1.0
        error_recovery_count = sum(
            1
            for previous, current in zip(messages, messages[1:])
            if self._is_error_recovery(previous, current)
        )
        return error_recovery_count / (len(messages) - 1)

    def _is_error_recovery(self, message1: Dict, message2: Dict) -> bool:
        """Return True when ``message2`` contains a correction phrase.

        ``message1`` is accepted for interface symmetry but is not inspected.
        """
        content2 = self._get_text(message2)
        return any(phrase in content2 for phrase in self._CORRECTION_PHRASES)

    def _measure_satisfaction(self, conversation: Dict) -> float:
        """Estimate user satisfaction from proxy signals.

        Half of the score is the number of user messages containing a
        positive word divided by the total number of messages; the other
        half grows with conversation length and saturates at 20 messages.
        An empty conversation scores 0.0.

        NOTE(review): the sentiment half divides by all messages, not only
        user messages. Confirm this is intended.
        """
        messages = self._get_messages(conversation)
        if not messages:
            return 0.0

        positive_indicators = 0
        for message in messages:
            if message.get("role") != "user":
                continue
            content = self._get_text(message)
            if any(word in content for word in self._POSITIVE_WORDS):
                positive_indicators += 1

        sentiment_part = (positive_indicators / len(messages)) * 0.5
        length_part = (min(len(messages), 20) / 20) * 0.5
        return sentiment_part + length_part

    def _calculate_conversation_duration(self, conversation: Dict) -> float:
        """Return minutes between the first and last message timestamps.

        Returns 0.0 for fewer than two messages, or when a timestamp is
        missing, is not a string, or cannot be parsed as ISO 8601.
        """
        messages = self._get_messages(conversation)
        if len(messages) < 2:
            return 0.0
        try:
            # A trailing "Z" is stripped before parsing, as fromisoformat
            # rejects it on older Python versions.
            start_time = datetime.fromisoformat(messages[0]["timestamp"].replace("Z", ""))
            end_time = datetime.fromisoformat(messages[-1]["timestamp"].replace("Z", ""))
            return (end_time - start_time).total_seconds() / 60.0
        except (KeyError, TypeError, ValueError, AttributeError):
            # Best effort: unusable timestamps yield "unknown" (0.0); other
            # exceptions such as KeyboardInterrupt are no longer swallowed.
            return 0.0

    def _calculate_average(self, scores: List[float]) -> float:
        """Return the arithmetic mean of ``scores``, or 0.0 for an empty list."""
        return sum(scores) / len(scores) if scores else 0.0

    def generate_report(self) -> str:
        """Evaluate all conversations and render the outcome as markdown.

        Returns:
            The markdown report. With no conversations the summary table
            shows zeros and the details section is empty.
        """
        results = self.evaluate_conversations()
        summary = results.get("summary", {})

        lines = [
            "# Conversation Quality Evaluation Report",
            "",
            "## Summary",
            "",
            "Evaluation of conversation quality for Stack 2.9.",
            "",
            "## Overall Statistics",
            "",
            "| Metric | Value |",
            "|--------|-------|",
            f"| Total Conversations | {summary.get('total_conversations', 0)} |",
            f"| Average Context Retention | {summary.get('average_context_retention', 0.0):.2%} |",
            f"| Average Coherence | {summary.get('average_coherence', 0.0):.2%} |",
            f"| Average Error Recovery | {summary.get('average_error_recovery', 0.0):.2%} |",
            f"| Average Satisfaction | {summary.get('average_satisfaction', 0.0):.2%} |",
            "",
            "## Conversation Details",
            "",
        ]
        for conv_id, result in self.results.items():
            lines.extend([
                f"### Conversation {conv_id}",
                "",
                f"- **Messages**: {result['message_count']}",
                f"- **Duration**: {result['duration_minutes']:.1f} minutes",
                f"- **Context Retention**: {result['context_retention']:.2%}",
                f"- **Coherence**: {result['coherence']:.2%}",
                f"- **Error Recovery**: {result['error_recovery']:.2%}",
                f"- **Satisfaction**: {result['satisfaction']:.2%}",
                "",
            ])
        return "\n".join(lines)
if __name__ == "__main__":
    # Local import: only the script entry point needs filesystem helpers.
    import os

    evaluator = ConversationQualityEvaluator()
    results = evaluator.evaluate_conversations()
    print("Conversation Quality Evaluation Complete!")
    print(json.dumps(results, indent=2))

    # generate_report() runs the evaluation again before rendering.
    report = evaluator.generate_report()
    print(report)

    # Save results. The output directory is created first so a fresh
    # checkout does not fail with FileNotFoundError.
    os.makedirs("results", exist_ok=True)
    with open("results/conversation_quality_evaluation.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    with open("results/conversation_quality_report.md", "w", encoding="utf-8") as f:
        f.write(report)