Spaces:
Sleeping
Sleeping
Commit
·
ad23357
1
Parent(s):
7bba3ba
More sophisticated attempt at fixing tabs
Browse files
app.py
CHANGED
|
@@ -13,6 +13,44 @@ from normalize import normalize
|
|
| 13 |
from transformers import AutoTokenizer
|
| 14 |
from modeling_nova import NovaTokenizer, NovaForCausalLM
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
print("Downloading model")
|
| 17 |
|
| 18 |
tokenizer = AutoTokenizer.from_pretrained(
|
|
@@ -33,6 +71,7 @@ examples = json.load(open("humaneval_decompile_nova_6.7b.json", "r"))
|
|
| 33 |
def predict(type, input_asm, _c_source):
|
| 34 |
|
| 35 |
if "<func0>:" not in input_asm:
|
|
|
|
| 36 |
|
| 37 |
# Add a bogus function header if needed.
|
| 38 |
first_line = input_asm.split("\n")[0]
|
|
@@ -40,8 +79,9 @@ def predict(type, input_asm, _c_source):
|
|
| 40 |
print("Adding synthetic function header")
|
| 41 |
input_asm = "<func0>:\n" + input_asm
|
| 42 |
|
| 43 |
-
#
|
| 44 |
-
input_asm =
|
|
|
|
| 45 |
normalized_asm = normalize(input_asm)
|
| 46 |
else:
|
| 47 |
normalized_asm = input_asm
|
|
|
|
| 13 |
from transformers import AutoTokenizer
|
| 14 |
from modeling_nova import NovaTokenizer, NovaForCausalLM
|
| 15 |
|
| 16 |
+
|
| 17 |
+
def fix_assembly_tabs(asm_text):
    """Re-insert the tab separators of objdump-style disassembly lines.

    Pasted disassembly often has its tabs converted to runs of spaces.
    Every line shaped like ``address: hex_bytes [mnemonic [operands]]`` is
    rewritten as ``address:<TAB>hex_bytes<TAB>mnemonic operands``; a line
    that carries only bytes (no mnemonic) becomes ``address:<TAB>hex_bytes``.

    Args:
        asm_text: Disassembly text, one instruction per line.

    Returns:
        The text with trailing whitespace stripped from every line and
        instruction lines re-tabbed.  Lines that do not look like an
        instruction (labels, blank lines, section headers, ...) are kept
        unchanged apart from the trailing-whitespace strip.
    """
    import re  # function-scope import keeps this helper self-contained

    # Raw bytes are matched as whitespace-separated tokens of exactly two hex
    # digits.  The ``(?!\w)`` guard stops a mnemonic that merely starts with
    # hex digits ("add", "dec", "fadd", ...) from being consumed as a byte,
    # and makes sure multi-byte encodings such as "f3 0f 1e fa" stay together
    # instead of being split after the first byte.
    line_re = re.compile(
        r"^(\s*)([0-9a-f]+):\s*"
        r"([0-9a-f]{2}(?!\w)(?:\s+[0-9a-f]{2}(?!\w))*)"
        r"(?:\s+(\w+)(\s+.*)?)?$",
        re.IGNORECASE,
    )

    fixed_lines = []
    for raw_line in asm_text.split("\n"):
        line = raw_line.rstrip()  # trailing whitespace is never significant
        match = line_re.match(line)
        if match is None:
            # Blank line or not an instruction line: keep as is.
            fixed_lines.append(line)
            continue

        indent, address, hex_bytes, instruction, operands = match.groups()
        # Collapse any run of whitespace between bytes to a single space.
        hex_bytes = " ".join(hex_bytes.split())

        if instruction is None:
            # Bytes-only line (e.g. the continuation of a long encoding).
            fixed_lines.append(f"{indent}{address}:\t{hex_bytes}")
        else:
            fixed_lines.append(
                f"{indent}{address}:\t{hex_bytes}\t{instruction}{operands or ''}"
            )

    return "\n".join(fixed_lines)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
print("Downloading model")
|
| 55 |
|
| 56 |
tokenizer = AutoTokenizer.from_pretrained(
|
|
|
|
| 71 |
def predict(type, input_asm, _c_source):
|
| 72 |
|
| 73 |
if "<func0>:" not in input_asm:
|
| 74 |
+
# Needs normalizing
|
| 75 |
|
| 76 |
# Add a bogus function header if needed.
|
| 77 |
first_line = input_asm.split("\n")[0]
|
|
|
|
| 79 |
print("Adding synthetic function header")
|
| 80 |
input_asm = "<func0>:\n" + input_asm
|
| 81 |
|
| 82 |
+
# Fix tab formatting in assembly code
|
| 83 |
+
input_asm = fix_assembly_tabs(input_asm)
|
| 84 |
+
|
| 85 |
normalized_asm = normalize(input_asm)
|
| 86 |
else:
|
| 87 |
normalized_asm = input_asm
|