Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -15,7 +15,9 @@ import io
|
|
| 15 |
import secrets
|
| 16 |
import string
|
| 17 |
from huggingface_hub import HfApi, HfFolder
|
| 18 |
-
import
|
|
|
|
|
|
|
| 19 |
|
| 20 |
# ========== CONFIGURATION ==========
|
| 21 |
PROFILES_DIR = "student_profiles"
|
|
@@ -25,14 +27,45 @@ MIN_AGE = 5
|
|
| 25 |
MAX_AGE = 120
|
| 26 |
SESSION_TOKEN_LENGTH = 32
|
| 27 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 28 |
-
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY") # Add your DeepSeek API key here
|
| 29 |
-
DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions" # Example endpoint
|
| 30 |
|
| 31 |
# Initialize Hugging Face API
|
| 32 |
if HF_TOKEN:
|
| 33 |
hf_api = HfApi(token=HF_TOKEN)
|
| 34 |
HfFolder.save_token(HF_TOKEN)
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
# ========== UTILITY FUNCTIONS ==========
|
| 37 |
def generate_session_token() -> str:
|
| 38 |
"""Generate a random session token for user identification."""
|
|
@@ -77,7 +110,7 @@ def validate_file(file_obj) -> None:
|
|
| 77 |
if file_size > MAX_FILE_SIZE_MB:
|
| 78 |
raise gr.Error(f"File too large. Max size: {MAX_FILE_SIZE_MB}MB")
|
| 79 |
|
| 80 |
-
# ==========
|
| 81 |
def extract_text_from_file(file_path: str, file_ext: str) -> str:
|
| 82 |
"""Enhanced text extraction with better error handling and fallbacks."""
|
| 83 |
text = ""
|
|
@@ -169,60 +202,29 @@ def remove_sensitive_info(text: str) -> str:
|
|
| 169 |
text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '[EMAIL]', text)
|
| 170 |
return text
|
| 171 |
|
| 172 |
-
|
| 173 |
-
"""Extract JSON string from API response."""
|
| 174 |
-
# Handle markdown code blocks
|
| 175 |
-
if '```json' in content:
|
| 176 |
-
content = content.split('```json')[1].split('```')[0].strip()
|
| 177 |
-
elif '```' in content:
|
| 178 |
-
content = content.split('```')[1].split('```')[0].strip()
|
| 179 |
-
|
| 180 |
-
# Sometimes the response is pure JSON
|
| 181 |
-
return content
|
| 182 |
-
|
| 183 |
-
def validate_parsed_data(data: Dict) -> Dict:
|
| 184 |
-
"""Validate and clean the parsed data structure."""
|
| 185 |
-
# Ensure required fields exist
|
| 186 |
-
if not isinstance(data, dict):
|
| 187 |
-
raise ValueError("Invalid data format")
|
| 188 |
-
|
| 189 |
-
# Set default structure if missing
|
| 190 |
-
if 'grade_level' not in data:
|
| 191 |
-
data['grade_level'] = 'Unknown'
|
| 192 |
-
|
| 193 |
-
if 'gpa' not in data:
|
| 194 |
-
data['gpa'] = {'weighted': 'N/A', 'unweighted': 'N/A'}
|
| 195 |
-
|
| 196 |
-
if 'courses' not in data:
|
| 197 |
-
data['courses'] = []
|
| 198 |
-
|
| 199 |
-
# Clean course data
|
| 200 |
-
for course in data['courses']:
|
| 201 |
-
if 'grade' in course:
|
| 202 |
-
course['grade'] = course['grade'].upper().strip()
|
| 203 |
-
|
| 204 |
-
# Ensure numeric credits are strings
|
| 205 |
-
if 'credits' in course and isinstance(course['credits'], (int, float)):
|
| 206 |
-
course['credits'] = str(course['credits'])
|
| 207 |
-
|
| 208 |
-
return data
|
| 209 |
-
|
| 210 |
def parse_transcript_with_deepseek(text: str) -> Dict:
|
| 211 |
-
"""
|
| 212 |
-
if
|
| 213 |
-
raise gr.Error("DeepSeek
|
| 214 |
|
| 215 |
-
# Pre-process the text
|
| 216 |
-
text = remove_sensitive_info(text)
|
| 217 |
|
| 218 |
-
# Create a more robust prompt with examples
|
| 219 |
prompt = f"""
|
| 220 |
-
Analyze this academic transcript and extract structured information
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
{{
|
| 227 |
"grade_level": "11",
|
| 228 |
"gpa": {{
|
|
@@ -240,44 +242,76 @@ def parse_transcript_with_deepseek(text: str) -> Dict:
|
|
| 240 |
}}
|
| 241 |
]
|
| 242 |
}}
|
| 243 |
-
|
| 244 |
Transcript Text:
|
| 245 |
-
{text
|
| 246 |
"""
|
| 247 |
|
| 248 |
-
headers = {
|
| 249 |
-
"Authorization": f"Bearer {DEEPSEEK_API_KEY}",
|
| 250 |
-
"Content-Type": "application/json"
|
| 251 |
-
}
|
| 252 |
-
|
| 253 |
-
payload = {
|
| 254 |
-
"model": "deepseek-chat",
|
| 255 |
-
"messages": [{"role": "user", "content": prompt}],
|
| 256 |
-
"temperature": 0.1,
|
| 257 |
-
"max_tokens": 2000
|
| 258 |
-
}
|
| 259 |
-
|
| 260 |
try:
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
|
| 265 |
-
|
|
|
|
|
|
|
| 266 |
|
| 267 |
-
# Extract JSON from
|
| 268 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
|
| 270 |
-
#
|
| 271 |
-
parsed_data =
|
|
|
|
| 272 |
|
| 273 |
-
return parsed_data
|
| 274 |
|
| 275 |
-
except
|
| 276 |
-
raise gr.Error(
|
| 277 |
-
except json.JSONDecodeError as e:
|
| 278 |
-
raise gr.Error(f"Failed to parse API response: {str(e)}")
|
| 279 |
except Exception as e:
|
| 280 |
-
raise gr.Error(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
|
| 282 |
def format_transcript_output(data: Dict) -> str:
|
| 283 |
"""Format the parsed data into human-readable text."""
|
|
@@ -326,10 +360,10 @@ def parse_transcript(file_obj) -> Tuple[str, Optional[Dict]]:
|
|
| 326 |
# Extract text from file
|
| 327 |
text = extract_text_from_file(file_obj.name, file_ext)
|
| 328 |
|
| 329 |
-
#
|
| 330 |
parsed_data = parse_transcript_with_deepseek(text)
|
| 331 |
|
| 332 |
-
# Format output
|
| 333 |
output_text = format_transcript_output(parsed_data)
|
| 334 |
|
| 335 |
# Prepare the data structure for saving
|
|
@@ -339,7 +373,7 @@ def parse_transcript(file_obj) -> Tuple[str, Optional[Dict]]:
|
|
| 339 |
"courses": defaultdict(list)
|
| 340 |
}
|
| 341 |
|
| 342 |
-
# Organize courses by grade level
|
| 343 |
for course in parsed_data.get('courses', []):
|
| 344 |
grade_level = course.get('grade_level', 'Unknown')
|
| 345 |
transcript_data["courses"][grade_level].append(course)
|
|
@@ -1043,6 +1077,13 @@ def create_interface():
|
|
| 1043 |
background-color: #ffebee;
|
| 1044 |
color: #c62828;
|
| 1045 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1046 |
"""
|
| 1047 |
|
| 1048 |
gr.Markdown("""
|
|
@@ -1051,6 +1092,12 @@ def create_interface():
|
|
| 1051 |
Complete each step to get customized learning recommendations.
|
| 1052 |
""")
|
| 1053 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1054 |
# Progress tracker - now with dynamic styling
|
| 1055 |
with gr.Row():
|
| 1056 |
with gr.Column(scale=1):
|
|
@@ -1101,6 +1148,9 @@ def create_interface():
|
|
| 1101 |
transcript_data = gr.State()
|
| 1102 |
|
| 1103 |
def process_transcript_and_update(file_obj, current_tab_status):
|
|
|
|
|
|
|
|
|
|
| 1104 |
output_text, data = parse_transcript(file_obj)
|
| 1105 |
if "Error" not in output_text:
|
| 1106 |
new_status = current_tab_status.copy()
|
|
@@ -1418,6 +1468,14 @@ def create_interface():
|
|
| 1418 |
inputs=[gr.State(4), tab_completed],
|
| 1419 |
outputs=[tabs, nav_message, quiz_alert]
|
| 1420 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1421 |
|
| 1422 |
return app
|
| 1423 |
|
|
|
|
| 15 |
import secrets
|
| 16 |
import string
|
| 17 |
from huggingface_hub import HfApi, HfFolder
|
| 18 |
+
import torch
|
| 19 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 20 |
+
import time
|
| 21 |
|
| 22 |
# ========== CONFIGURATION ==========
|
| 23 |
PROFILES_DIR = "student_profiles"
|
|
|
|
| 27 |
MAX_AGE = 120
|
| 28 |
SESSION_TOKEN_LENGTH = 32
|
| 29 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
|
|
|
|
|
|
| 30 |
|
| 31 |
# Initialize Hugging Face API
|
| 32 |
if HF_TOKEN:
|
| 33 |
hf_api = HfApi(token=HF_TOKEN)
|
| 34 |
HfFolder.save_token(HF_TOKEN)
|
| 35 |
|
| 36 |
+
# ========== DEEPSEEK MODEL LOADING ==========
# Model repo is configurable; defaults to the original hard-coded checkpoint.
# NOTE(review): DeepSeek-V3 is a very large MoE checkpoint — almost certainly
# too big for typical Space hardware, which is a likely cause of the startup
# "Runtime error". Point DEEPSEEK_MODEL_ID at a smaller checkpoint if needed.
DEEPSEEK_MODEL_ID = os.getenv("DEEPSEEK_MODEL_ID", "deepseek-ai/DeepSeek-V3")

def load_deepseek_model():
    """Load the DeepSeek tokenizer and model at startup.

    Returns:
        (model, tokenizer) on success, or (None, None) on any failure so the
        rest of the app can degrade gracefully instead of crashing at import.
    """
    # gr.Progress() only works when injected into a Gradio event handler;
    # instantiating and calling it at module-import time fails. Use plain
    # logging for startup progress instead.
    try:
        start_time = time.time()

        print("Loading DeepSeek tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            DEEPSEEK_MODEL_ID,
            trust_remote_code=True,  # DeepSeek repos ship custom model code
        )

        print("Loading DeepSeek model weights...")
        model = AutoModelForCausalLM.from_pretrained(
            DEEPSEEK_MODEL_ID,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map="auto",  # spread across GPUs / offload as available
        )

        print(f"DeepSeek model loaded in {time.time() - start_time:.2f} seconds")
        return model, tokenizer

    except Exception as e:
        # Best-effort: callers check for None and surface a friendly error
        # instead of a hard crash at import time.
        print(f"Error loading DeepSeek model: {str(e)}")
        return None, None

# Load model at startup
model, tokenizer = load_deepseek_model()
|
| 68 |
+
|
| 69 |
# ========== UTILITY FUNCTIONS ==========
|
| 70 |
def generate_session_token() -> str:
|
| 71 |
"""Generate a random session token for user identification."""
|
|
|
|
| 110 |
if file_size > MAX_FILE_SIZE_MB:
|
| 111 |
raise gr.Error(f"File too large. Max size: {MAX_FILE_SIZE_MB}MB")
|
| 112 |
|
| 113 |
+
# ========== TEXT EXTRACTION FUNCTIONS ==========
|
| 114 |
def extract_text_from_file(file_path: str, file_ext: str) -> str:
|
| 115 |
"""Enhanced text extraction with better error handling and fallbacks."""
|
| 116 |
text = ""
|
|
|
|
| 202 |
text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '[EMAIL]', text)
|
| 203 |
return text
|
| 204 |
|
| 205 |
+
# ========== TRANSCRIPT PARSING ==========
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
def parse_transcript_with_deepseek(text: str) -> Dict:
|
| 207 |
+
"""Use local DeepSeek model to parse transcript text"""
|
| 208 |
+
if model is None or tokenizer is None:
|
| 209 |
+
raise gr.Error("DeepSeek model failed to load. Please try again later.")
|
| 210 |
|
| 211 |
+
# Pre-process the text
|
| 212 |
+
text = remove_sensitive_info(text[:15000]) # Limit to first 15k chars
|
| 213 |
|
|
|
|
| 214 |
prompt = f"""
|
| 215 |
+
Analyze this academic transcript and extract structured information:
|
| 216 |
+
- Current grade level
|
| 217 |
+
- Weighted GPA (if available)
|
| 218 |
+
- Unweighted GPA (if available)
|
| 219 |
+
- List of all courses with:
|
| 220 |
+
* Course code
|
| 221 |
+
* Course name
|
| 222 |
+
* Grade received
|
| 223 |
+
* Credits earned
|
| 224 |
+
* Year/semester taken
|
| 225 |
+
* Grade level when taken
|
| 226 |
+
|
| 227 |
+
Return the data in this JSON structure:
|
| 228 |
{{
|
| 229 |
"grade_level": "11",
|
| 230 |
"gpa": {{
|
|
|
|
| 242 |
}}
|
| 243 |
]
|
| 244 |
}}
|
| 245 |
+
|
| 246 |
Transcript Text:
|
| 247 |
+
{text}
|
| 248 |
"""
|
| 249 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
try:
|
| 251 |
+
# Show progress to user
|
| 252 |
+
progress = gr.Progress()
|
| 253 |
+
progress(0, desc="Analyzing transcript...")
|
| 254 |
+
|
| 255 |
+
# Tokenize and generate response
|
| 256 |
+
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
| 257 |
+
progress(0.3)
|
| 258 |
+
|
| 259 |
+
outputs = model.generate(
|
| 260 |
+
**inputs,
|
| 261 |
+
max_new_tokens=2000,
|
| 262 |
+
temperature=0.1,
|
| 263 |
+
do_sample=True
|
| 264 |
+
)
|
| 265 |
+
progress(0.8)
|
| 266 |
|
| 267 |
+
# Decode the response
|
| 268 |
+
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 269 |
+
progress(0.9)
|
| 270 |
|
| 271 |
+
# Extract the JSON content from the response
|
| 272 |
+
if '```json' in response:
|
| 273 |
+
json_str = response.split('```json')[1].split('```')[0].strip()
|
| 274 |
+
elif '```' in response:
|
| 275 |
+
json_str = response.split('```')[1].split('```')[0].strip()
|
| 276 |
+
else:
|
| 277 |
+
json_str = response
|
| 278 |
|
| 279 |
+
# Parse and validate the JSON
|
| 280 |
+
parsed_data = json.loads(json_str)
|
| 281 |
+
progress(1.0)
|
| 282 |
|
| 283 |
+
return validate_parsed_data(parsed_data)
|
| 284 |
|
| 285 |
+
except torch.cuda.OutOfMemoryError:
|
| 286 |
+
raise gr.Error("The model ran out of memory. Try with a smaller transcript or upgrade your GPU.")
|
|
|
|
|
|
|
| 287 |
except Exception as e:
|
| 288 |
+
raise gr.Error(f"Error processing transcript: {str(e)}")
|
| 289 |
+
|
| 290 |
+
def validate_parsed_data(data: Dict) -> Dict:
    """Validate and normalize the transcript structure parsed from model JSON.

    Ensures the required top-level keys exist ('grade_level', 'gpa',
    'courses'), normalizes course grades to upper-case trimmed strings, and
    coerces numeric credit values to strings for consistent display.

    Args:
        data: Parsed JSON payload produced by the model (untrusted).

    Returns:
        The same dict, with defaults filled in and course fields cleaned.

    Raises:
        ValueError: If ``data`` is not a dict, or 'courses' is not a list.
    """
    if not isinstance(data, dict):
        raise ValueError("Invalid data format")

    # Set default structure if missing.
    data.setdefault('grade_level', 'Unknown')
    data.setdefault('gpa', {'weighted': 'N/A', 'unweighted': 'N/A'})
    data.setdefault('courses', [])

    # Model output is untrusted: a non-list 'courses' would break callers
    # that iterate/append, so reject it up front.
    if not isinstance(data['courses'], list):
        raise ValueError("Invalid data format")

    # Clean course data; skip entries that are not dicts rather than crash.
    for course in data['courses']:
        if not isinstance(course, dict):
            continue
        if 'grade' in course:
            # str() guards against the model emitting a non-string grade,
            # which previously raised AttributeError on .upper().
            course['grade'] = str(course['grade']).upper().strip()

        # Ensure numeric credits are strings.
        if 'credits' in course and isinstance(course['credits'], (int, float)):
            course['credits'] = str(course['credits'])

    return data
|
| 315 |
|
| 316 |
def format_transcript_output(data: Dict) -> str:
|
| 317 |
"""Format the parsed data into human-readable text."""
|
|
|
|
| 360 |
# Extract text from file
|
| 361 |
text = extract_text_from_file(file_obj.name, file_ext)
|
| 362 |
|
| 363 |
+
# Use DeepSeek for parsing
|
| 364 |
parsed_data = parse_transcript_with_deepseek(text)
|
| 365 |
|
| 366 |
+
# Format output text
|
| 367 |
output_text = format_transcript_output(parsed_data)
|
| 368 |
|
| 369 |
# Prepare the data structure for saving
|
|
|
|
| 373 |
"courses": defaultdict(list)
|
| 374 |
}
|
| 375 |
|
| 376 |
+
# Organize courses by grade level
|
| 377 |
for course in parsed_data.get('courses', []):
|
| 378 |
grade_level = course.get('grade_level', 'Unknown')
|
| 379 |
transcript_data["courses"][grade_level].append(course)
|
|
|
|
| 1077 |
background-color: #ffebee;
|
| 1078 |
color: #c62828;
|
| 1079 |
}
|
| 1080 |
+
.model-loading {
|
| 1081 |
+
padding: 15px;
|
| 1082 |
+
margin: 15px 0;
|
| 1083 |
+
border-radius: 4px;
|
| 1084 |
+
background-color: #fff3e0;
|
| 1085 |
+
color: #e65100;
|
| 1086 |
+
}
|
| 1087 |
"""
|
| 1088 |
|
| 1089 |
gr.Markdown("""
|
|
|
|
| 1092 |
Complete each step to get customized learning recommendations.
|
| 1093 |
""")
|
| 1094 |
|
| 1095 |
+
# Model loading status
|
| 1096 |
+
model_status = gr.HTML(
|
| 1097 |
+
value="<div class='model-loading'>Loading AI model... (This may take a few minutes)</div>" if model is None else "",
|
| 1098 |
+
visible=model is None
|
| 1099 |
+
)
|
| 1100 |
+
|
| 1101 |
# Progress tracker - now with dynamic styling
|
| 1102 |
with gr.Row():
|
| 1103 |
with gr.Column(scale=1):
|
|
|
|
| 1148 |
transcript_data = gr.State()
|
| 1149 |
|
| 1150 |
def process_transcript_and_update(file_obj, current_tab_status):
|
| 1151 |
+
if model is None:
|
| 1152 |
+
return "Error: AI model failed to load. Please try again later.", None, current_tab_status, gr.update(), gr.update(), gr.update()
|
| 1153 |
+
|
| 1154 |
output_text, data = parse_transcript(file_obj)
|
| 1155 |
if "Error" not in output_text:
|
| 1156 |
new_status = current_tab_status.copy()
|
|
|
|
| 1468 |
inputs=[gr.State(4), tab_completed],
|
| 1469 |
outputs=[tabs, nav_message, quiz_alert]
|
| 1470 |
)
|
| 1471 |
+
|
| 1472 |
+
# Check model loading status periodically
|
| 1473 |
+
def check_model_status():
|
| 1474 |
+
if model is not None and tokenizer is not None:
|
| 1475 |
+
return gr.update(visible=False)
|
| 1476 |
+
return gr.update(visible=True)
|
| 1477 |
+
|
| 1478 |
+
app.load(check_model_status, None, model_status, every=1)
|
| 1479 |
|
| 1480 |
return app
|
| 1481 |
|