ejschwartz committed on
Commit
ad23357
·
1 Parent(s): 7bba3ba

More sophisticated attempt at fixing tabs

Browse files
Files changed (1) hide show
  1. app.py +42 -2
app.py CHANGED
@@ -13,6 +13,44 @@ from normalize import normalize
13
  from transformers import AutoTokenizer
14
  from modeling_nova import NovaTokenizer, NovaForCausalLM
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  print("Downloading model")
17
 
18
  tokenizer = AutoTokenizer.from_pretrained(
@@ -33,6 +71,7 @@ examples = json.load(open("humaneval_decompile_nova_6.7b.json", "r"))
33
  def predict(type, input_asm, _c_source):
34
 
35
  if "<func0>:" not in input_asm:
 
36
 
37
  # Add a bogus function header if needed.
38
  first_line = input_asm.split("\n")[0]
@@ -40,8 +79,9 @@ def predict(type, input_asm, _c_source):
40
  print("Adding synthetic function header")
41
  input_asm = "<func0>:\n" + input_asm
42
 
43
- # Convert consecutive spaces to tabs
44
- input_asm = re.sub(r' {2,}', '\t', input_asm)
 
45
  normalized_asm = normalize(input_asm)
46
  else:
47
  normalized_asm = input_asm
 
13
  from transformers import AutoTokenizer
14
  from modeling_nova import NovaTokenizer, NovaForCausalLM
15
 
16
+
17
def fix_assembly_tabs(asm_text: str) -> str:
    """Rewrite disassembly lines so that their columns are tab-separated.

    Lines shaped like ``address: hex_bytes mnemonic [operands]`` are rebuilt
    as ``indent + address + ":" + TAB + hex_bytes + TAB + mnemonic + operands``.
    Runs of whitespace inside the hex-byte column collapse to single spaces;
    the operands (including their leading whitespace) are kept verbatim.
    Every line has trailing whitespace removed. Lines that do not look like
    an instruction (labels, blank lines, byte-only continuation lines) are
    otherwise left untouched.

    Args:
        asm_text: Disassembly text, one instruction per line.

    Returns:
        The text with instruction lines re-delimited by tabs.
    """
    # The hex column is matched greedily as whitespace-separated tokens made
    # of whole bytes (an even number of hex digits). A lazy match here would
    # stop after the first byte and mistake the second byte for the mnemonic.
    # The negative lookahead keeps a bare two-digit byte from ever being taken
    # as the mnemonic, so byte-only continuation lines do not match at all.
    # Mnemonics such as "add" or "dec" are hex-looking but have odd length, so
    # they are never absorbed into the byte column.
    # NOTE(review): an even-length all-hex mnemonic (e.g. "fadd") followed by
    # a single bare-word operand is still ambiguous - confirm against inputs.
    asm_line = re.compile(
        r"^(\s*)([0-9a-f]+):\s*"
        r"((?:[0-9a-f]{2})+\b(?:\s+(?:[0-9a-f]{2})+\b)*)"
        r"\s+(?![0-9a-f]{2}\b)([a-z_]\w*)(\s+.*)?$",
        re.IGNORECASE,
    )

    fixed_lines = []
    for raw_line in asm_text.split("\n"):
        line = raw_line.rstrip()
        match = asm_line.match(line) if line else None
        if match is None:
            # Blank or non-instruction line: keep it (minus trailing space).
            fixed_lines.append(line)
            continue

        indent, address, hex_bytes, instruction, operands = match.groups()
        hex_bytes = " ".join(hex_bytes.split())
        fixed_lines.append(
            f"{indent}{address}:\t{hex_bytes}\t{instruction}{operands or ''}"
        )

    return "\n".join(fixed_lines)
52
+
53
+
54
  print("Downloading model")
55
 
56
  tokenizer = AutoTokenizer.from_pretrained(
 
71
  def predict(type, input_asm, _c_source):
72
 
73
  if "<func0>:" not in input_asm:
74
+ # Needs normalizing
75
 
76
  # Add a bogus function header if needed.
77
  first_line = input_asm.split("\n")[0]
 
79
  print("Adding synthetic function header")
80
  input_asm = "<func0>:\n" + input_asm
81
 
82
+ # Fix tab formatting in assembly code
83
+ input_asm = fix_assembly_tabs(input_asm)
84
+
85
  normalized_asm = normalize(input_asm)
86
  else:
87
  normalized_asm = input_asm