david167 committed on
Commit
6ea58d5
·
1 Parent(s): 7629837

Upgrade to Llama 3.1 8B-Instruct for better long-form content

Browse files

- Switch from Mistral-7B to Llama-3.1-8B-Instruct
- 4x larger context window (32K → 128K tokens)
- Better reasoning and question generation quality
- Similar speed and memory usage
- Perfect for long-form content interpretation

Files changed (3) hide show
  1. app.py +3 -3
  2. gradio_app.py +2 -2
  3. upgrade_models.py +75 -0
app.py CHANGED
@@ -122,8 +122,8 @@ async def load_model():
122
  try:
123
  logger.info("Loading model with transformers...")
124
 
125
- # Use Mistral 7B Instruct - 30-40% faster with same quality
126
- base_model_name = "mistralai/Mistral-7B-Instruct-v0.2"
127
 
128
  tokenizer, model = await load_model_with_retry(base_model_name, hf_token)
129
 
@@ -337,7 +337,7 @@ async def generate_questions(request: QuestionGenerationRequest):
337
  questions.append(f"What is the main point of this statement: '{request.statement[:100]}...'?")
338
 
339
  metadata = {
340
- "model": "mistralai/Mistral-7B-Instruct-v0.2",
341
  "temperature": request.temperature,
342
  "difficulty_level": request.difficulty_level,
343
  "generated_text_length": len(generated_text),
 
122
  try:
123
  logger.info("Loading model with transformers...")
124
 
125
+ # Use Llama 3.1 8B Instruct - 4x context window, better reasoning
126
+ base_model_name = "meta-llama/Llama-3.1-8B-Instruct"
127
 
128
  tokenizer, model = await load_model_with_retry(base_model_name, hf_token)
129
 
 
337
  questions.append(f"What is the main point of this statement: '{request.statement[:100]}...'?")
338
 
339
  metadata = {
340
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
341
  "temperature": request.temperature,
342
  "difficulty_level": request.difficulty_level,
343
  "generated_text_length": len(generated_text),
gradio_app.py CHANGED
@@ -38,8 +38,8 @@ class ModelManager:
38
  # Get HF token from environment
39
  hf_token = os.getenv("HF_TOKEN")
40
 
41
- logger.info("Loading Mistral-7B-Instruct-v0.2 model...")
42
- base_model_name = "mistralai/Mistral-7B-Instruct-v0.2"
43
 
44
  self.tokenizer = AutoTokenizer.from_pretrained(
45
  base_model_name,
 
38
  # Get HF token from environment
39
  hf_token = os.getenv("HF_TOKEN")
40
 
41
+ logger.info("Loading Llama-3.1-8B-Instruct model...")
42
+ base_model_name = "meta-llama/Llama-3.1-8B-Instruct"
43
 
44
  self.tokenizer = AutoTokenizer.from_pretrained(
45
  base_model_name,
upgrade_models.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model upgrade options for better long-form content interpretation
2
+
3
+ # OPTION 1: Llama 3.1 70B (Best Quality - if you have compute)
4
+ LLAMA_70B = {
5
+ "model_name": "meta-llama/Llama-3.1-70B-Instruct",
6
+ "context_window": "128K tokens",
7
+ "quality": "Excellent - best for complex content",
8
+ "speed": "Moderate (2-4x slower than 7B)",
9
+ "memory_required": "~35GB VRAM",
10
+ "fits_on_a100": True,
11
+ "upgrade_difficulty": "Easy - just change model name"
12
+ }
13
+
14
+ # OPTION 2: Qwen2.5-32B (Best Balance)
15
+ QWEN_32B = {
16
+ "model_name": "Qwen/Qwen2.5-32B-Instruct",
17
+ "context_window": "128K tokens",
18
+ "quality": "Excellent - specialized for reasoning",
19
+ "speed": "Fast (1.5-2x slower than 7B)",
20
+ "memory_required": "~16GB VRAM",
21
+ "fits_on_a100": True,
22
+ "upgrade_difficulty": "Easy - just change model name"
23
+ }
24
+
25
+ # OPTION 3: Llama 3.1 8B (Easy Upgrade)
26
+ LLAMA_8B = {
27
+ "model_name": "meta-llama/Llama-3.1-8B-Instruct",
28
+ "context_window": "128K tokens",
29
+ "quality": "Very good - better than Mistral-7B",
30
+ "speed": "Fast (similar to current)",
31
+ "memory_required": "~8GB VRAM",
32
+ "fits_on_a100": True,
33
+ "upgrade_difficulty": "Trivial - just change model name"
34
+ }
35
+
36
+ # OPTION 4: Claude 3.5 Sonnet via API (Best Overall)
37
+ CLAUDE_API = {
38
+ "model_name": "claude-3-5-sonnet-20241022",
39
+ "context_window": "200K tokens",
40
+ "quality": "Excellent - best for nuanced questions",
41
+ "speed": "Very fast via API",
42
+ "memory_required": "0GB (API-based)",
43
+ "cost": "$3 per million input tokens",
44
+ "upgrade_difficulty": "Medium - requires API integration"
45
+ }
46
+
47
+ def get_recommended_upgrade():
48
+ """Get the best upgrade based on priorities"""
49
+
50
+ recommendations = {
51
+ "best_quality": LLAMA_70B,
52
+ "best_balance": QWEN_32B,
53
+ "easiest_upgrade": LLAMA_8B,
54
+ "best_overall": CLAUDE_API
55
+ }
56
+
57
+ return recommendations
58
+
59
+ # Context window comparison
60
+ CONTEXT_COMPARISON = {
61
+ "Current Mistral-7B": "32K tokens",
62
+ "Llama 3.1 8B": "128K tokens (4x more)",
63
+ "Llama 3.1 70B": "128K tokens (4x more)",
64
+ "Qwen2.5-32B": "128K tokens (4x more)",
65
+ "Claude 3.5 Sonnet": "200K tokens (6x more)"
66
+ }
67
+
68
+ # Performance for long-form content
69
+ LONG_FORM_PERFORMANCE = {
70
+ "Mistral-7B": "Good for simple questions",
71
+ "Llama 3.1 8B": "Better reasoning, longer context",
72
+ "Qwen2.5-32B": "Excellent reasoning, great for complex content",
73
+ "Llama 3.1 70B": "Superior understanding, best for nuanced questions",
74
+ "Claude 3.5 Sonnet": "Best overall, excellent at context understanding"
75
+ }