hamxaameer committed
Commit 0958ebc (verified) · 1 parent: a52cd7c

Update app.py

Files changed (1): app.py (+79, -32)
app.py CHANGED
@@ -377,56 +377,103 @@ def generate_code_from_pseudo(pseudo_code, max_length, temperature, top_k, top_p
             generated_codes.append(f"# Generation {i+1} failed: No valid tokens")
             continue
 
-        # Decode with comprehensive error handling
+        # Decode with GPT-2 compatible handling
         try:
-            # First attempt: decode with skip_special_tokens=False
-            generated = loaded_tokenizer.decode(valid_tokens, skip_special_tokens=False)
+            # First attempt: standard decode with proper cleanup
+            generated = loaded_tokenizer.decode(valid_tokens, skip_special_tokens=False, clean_up_tokenization_spaces=True)
 
-            # Check if decode returned None or contains None
+            # GPT-2 specific: handle byte-level tokens properly
             if generated is None:
                 raise ValueError("Tokenizer decode returned None")
 
-            # Check for None in the string (shouldn't happen but be safe)
-            if 'None' in str(generated) or '\x00' in str(generated):
-                raise ValueError("Decoded string contains invalid characters")
+            # Clean up common GPT-2 artifacts
+            generated = generated.replace('Ġ', ' ').replace('▁', ' ')  # Handle different space tokens
+            generated = ' '.join(generated.split())  # Normalize whitespace
+
+            # Check for gibberish (too many special characters)
+            special_ratio = sum(1 for c in generated if not c.isalnum() and c not in ' \n\t.,;()[]{}+-*/=<>!&|^~%#@?:\'\"') / max(len(generated), 1)
+            if special_ratio > 0.5:  # More than 50% special chars = likely gibberish
+                raise ValueError("Decoded output appears to be gibberish")
 
         except Exception as decode_error:
             # Second attempt: decode with skip_special_tokens=True
             try:
-                generated = loaded_tokenizer.decode(valid_tokens, skip_special_tokens=True)
+                generated = loaded_tokenizer.decode(valid_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                 if generated is None:
                     raise ValueError("Tokenizer decode (skip_special) returned None")
-                if 'None' in str(generated) or '\x00' in str(generated):
-                    raise ValueError("Decoded string contains invalid characters")
+
+                # Clean up GPT-2 artifacts
+                generated = generated.replace('Ġ', ' ').replace('▁', ' ')
+                generated = ' '.join(generated.split())
+
+                # Check for gibberish again
+                special_ratio = sum(1 for c in generated if not c.isalnum() and c not in ' \n\t.,;()[]{}+-*/=<>!&|^~%#@?:\'\"') / max(len(generated), 1)
+                if special_ratio > 0.5:
+                    raise ValueError("Decoded output still appears to be gibberish")
+
             except Exception as decode_error2:
-                # Third attempt: manual token-to-string conversion
+                # Third attempt: manual byte-level decoding for GPT-2
                 try:
-                    # Convert tokens to string manually using vocab
-                    if hasattr(loaded_tokenizer, 'get_vocab'):
-                        vocab = loaded_tokenizer.get_vocab()
-                        inv_vocab = {v: k for k, v in vocab.items()}
-
-                        # Convert tokens to strings, skip unknown tokens
-                        token_strings = []
+                    # GPT-2 uses byte-level BPE, so we need to decode bytes properly
+                    if hasattr(loaded_tokenizer, 'byte_decoder'):
+                        # Use the tokenizer's byte decoder
+                        byte_tokens = []
                         for token_id in valid_tokens:
-                            if token_id in inv_vocab:
-                                token_str = inv_vocab[token_id]
-                                # Skip special tokens that might cause issues
-                                if token_str not in ['<pad>', '<unk>', '<mask>', '<s>', '</s>', '<PAD>', '<SEP>', '<CODE>', '<PSEUDO>']:
-                                    token_strings.append(token_str)
-
-                        generated = ''.join(token_strings)
+                            if token_id in loaded_tokenizer.decoder:
+                                token_bytes = loaded_tokenizer.decoder[token_id]
+                                if isinstance(token_bytes, bytes):
+                                    byte_tokens.append(token_bytes)
+                                elif isinstance(token_bytes, str):
+                                    byte_tokens.append(token_bytes.encode('utf-8', errors='ignore'))
 
-                        if not generated or generated.isspace():
-                            raise ValueError("Manual conversion produced empty string")
+                        if byte_tokens:
+                            # Decode the byte sequence
+                            full_bytes = b''.join(byte_tokens)
+                            generated = full_bytes.decode('utf-8', errors='replace')
+
+                            # Clean up
+                            generated = generated.replace('Ġ', ' ').replace('▁', ' ')
+                            generated = ' '.join(generated.split())
+
+                            if not generated or generated.isspace():
+                                raise ValueError("Byte decoding produced empty result")
+                        else:
+                            raise ValueError("No valid byte tokens found")
                     else:
-                        raise ValueError("Tokenizer has no get_vocab method")
+                        raise ValueError("Tokenizer has no byte_decoder")
 
-                except Exception as manual_error:
-                    # Final fallback: create a safe representation
-                    generated = f"# Decode failed: {str(decode_error)}\n# Manual conversion failed: {str(manual_error)}\n# Raw tokens: {valid_tokens[:10]}..."
+                except Exception as byte_error:
+                    # Fourth attempt: fallback to vocab-based conversion
+                    try:
+                        if hasattr(loaded_tokenizer, 'get_vocab'):
+                            vocab = loaded_tokenizer.get_vocab()
+
+                            # Convert tokens, handling byte-level tokens
+                            text_parts = []
+                            for token_id in valid_tokens:
+                                if token_id in vocab:
+                                    token_text = vocab[token_id]
+                                    # Handle byte-level tokens (start with Ġ or ▁)
+                                    if token_text.startswith('Ġ'):
+                                        text_parts.append(' ' + token_text[1:])
+                                    elif token_text.startswith('▁'):
+                                        text_parts.append(' ' + token_text[1:])
+                                    else:
+                                        text_parts.append(token_text)
+
+                            generated = ''.join(text_parts)
+                            generated = ' '.join(generated.split())  # Clean whitespace
+
+                            if not generated or generated.isspace():
+                                raise ValueError("Vocab conversion produced empty result")
+                        else:
+                            raise ValueError("Tokenizer has no get_vocab method")
+
+                    except Exception as vocab_error:
+                        # Final fallback: show what we have
+                        generated = f"# Decode failed: {str(decode_error)}\n# Byte decode failed: {str(byte_error)}\n# Vocab decode failed: {str(vocab_error)}\n# Raw tokens: {valid_tokens[:10]}..."
 
-        # Final safety check: ensure we have a string
+        # Final safety check
         if not isinstance(generated, str):
            generated = str(generated) if generated is not None else "# Decode returned non-string object"
 
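For reference, the byte-level path the third attempt reaches for can be reproduced with the slow `GPT2Tokenizer`'s own lookup tables. A minimal sketch of the idea, assuming a Hugging Face transformers slow tokenizer (fast tokenizers do not expose `byte_decoder`); `manual_gpt2_decode` and `token_ids` are illustrative names, not part of the commit:

```python
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def manual_gpt2_decode(token_ids):
    # decoder maps id -> token string; each character of the token
    # string stands for one byte of the original text
    tokens = [tokenizer.decoder[i] for i in token_ids if i in tokenizer.decoder]
    # byte_decoder maps each such character back to its byte value
    raw = bytearray(tokenizer.byte_decoder[ch] for tok in tokens for ch in tok)
    return raw.decode("utf-8", errors="replace")

ids = tokenizer.encode("def add(a, b):\n    return a + b")
print(manual_gpt2_decode(ids))  # round-trips the source string
```

Mapping characters through `byte_decoder` before UTF-8 decoding is essentially what `GPT2Tokenizer.convert_tokens_to_string` does internally. By contrast, re-encoding the token string with `str.encode('utf-8')`, as the third attempt above does, keeps the printable byte-alias characters (including `Ġ`) instead of recovering the original bytes, which is why the follow-up `replace('Ġ', ' ')` cleanup is still needed.

One caveat in the fourth attempt: `get_vocab()` returns a `{token_string: id}` mapping, so `token_id in vocab` never matches an integer id and the branch falls through to the final fallback. A sketch of the intended lookup, using the same inversion the pre-change code built as `inv_vocab`:

```python
vocab = tokenizer.get_vocab()                    # {token_string: id}
id_to_token = {v: k for k, v in vocab.items()}   # invert to {id: token_string}

text_parts = []
for token_id in ids:
    token_text = id_to_token.get(token_id)
    if token_text is None:
        continue  # skip ids outside the vocab
    # 'Ġ' (GPT-2) and '▁' (SentencePiece) mark a leading space
    if token_text[0] in ("Ġ", "▁"):
        text_parts.append(" " + token_text[1:])
    else:
        text_parts.append(token_text)
print("".join(text_parts))
```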