Spaces:

OSS-forge
/

BugGen

Running

File size: 10,863 Bytes

be20041
 
 
 
 
 
ec5045e
d6272b6
 
ec5045e
 
 
 
be20041
 
 
 
 
 
ec5045e
 
 
be20041
 
ec5045e
be20041
 
ec5045e
be20041
ec5045e
be20041
 
 
 
ec5045e
 
 
 
be20041
 
 
 
ec5045e
 
 
be20041
 
 
 
 
 
 
ec5045e
be20041
 
ec5045e
be20041
ec5045e
be20041
 
ec5045e
be20041
 
ec5045e
be20041
 
 
 
 
 
 
 
ec5045e
be20041
 
 
 
 
 
ec5045e
be20041
ec5045e
 
 
 
be20041
ec5045e
be20041
 
ec5045e
be20041
 
 
ec5045e
 
 
be20041
 
 
 
 
ec5045e
be20041
 
 
 
ec5045e
be20041
ec5045e
be20041
 
ec5045e
 
 
 
 
 
 
 
 
 
 
 
be20041
ec5045e
 
 
be20041
 
 
 
ec5045e
be20041
 
 
 
ec5045e
 
be20041
ec5045e
be20041
ec5045e
be20041
ec5045e
 
 
 
 
 
 
be20041
 
ec5045e
be20041
 
ec5045e
 
 
be20041
 
ec5045e
be20041
ec5045e
be20041
 
 
 
ec5045e
 
be20041
ec5045e
be20041
 
 
 
 
 
 
 
ec5045e
be20041
 
ec5045e
be20041
 
 
 
ec5045e
 
 
 
 
d6272b6
fbf65ef
d6272b6
b761fa3
fbf65ef
 
b761fa3
 
 
d6272b6
 
fbf65ef
d6272b6
 
 
 
 
fbf65ef
d6272b6
 
 
 
 
fbf65ef
d6272b6
 
23479ca
ec5045e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8439b6b
 
 
ec5045e
 
 
8439b6b
ec5045e
 
 
 
 
 
be20041
ec5045e
 
 
 
 
 
 
 
 
 
9540433
 
 
 
 
 
ec5045e
9540433
 
 
 
 
 
 
 
 
 
 
 
 
 
ec5045e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8439b6b
 
 
 
ec5045e
 
be20041
ec5045e
9540433
8439b6b
ec5045e
 
be20041
 
ec5045e
be20041
 
ec5045e
 
 
 
 
 
 
 
 
 
 
be20041
 
ec5045e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be20041
 
ec5045e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be20041
 
ec5045e
 
 
 
 
 
 
 
 
 
be20041
ec5045e
2456544

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import gradio as gr
import torch
import autopep8
import glob
import re
import os
from huggingface_hub import hf_hub_download


# ==========================
#  Utility functions
# ==========================

def normalize_indentation(code):
    """
    Normalize indentation in example code by removing excessive tabs.
    Also removes any backslash characters.
    """
    code = code.replace("\\", "")

    lines = code.split("\n")
    if not lines:
        return ""

    fixed_lines = []
    indent_fix_mode = False

    for i, line in enumerate(lines):
        if line.strip().startswith("def "):
            fixed_lines.append(line)
            indent_fix_mode = True
        elif indent_fix_mode and line.strip():
            # For indented lines in a function
            if line.startswith("\t\t"):  # Two tabs
                fixed_lines.append("\t" + line[2:])  # Replace with one tab
            elif line.startswith("        "):  # 8 spaces (2 levels)
                fixed_lines.append("    " + line[8:])  # Replace with 4 spaces
            else:
                fixed_lines.append(line)
        else:
            fixed_lines.append(line)

    return "\n".join(fixed_lines)


def clear_text(text):
    """
    Cleans text from escape sequences while preserving original formatting.
    """
    temp_newline = "TEMP_NEWLINE_PLACEHOLDER"
    temp_tab = "TEMP_TAB_PLACEHOLDER"

    text = text.replace("\\n", temp_newline)
    text = text.replace("\\t", temp_tab)

    text = text.replace("\\", "")

    text = text.replace(temp_newline, "\n")
    text = text.replace(temp_tab, "\t")

    return text


def encode_text(text):
    """
    Encodes control characters into escape sequences.
    """
    text = text.replace("\n", "\\n")
    text = text.replace("\t", "\\t")
    return text


def format_code(code):
    """
    Format Python code using autopep8 with aggressive settings.
    """
    try:
        formatted_code = autopep8.fix_code(
            code,
            options={
                "aggressive": 2,
                "max_line_length": 88,
                "indent_size": 4,
            },
        )

        # Additional formatting for consistent spacing around parentheses and operators
        formatted_code = formatted_code.replace("( ", "(").replace(" )", ")")

        for op in ["+", "-", "*", "/", "=", "==", "!=", ">=", "<=", ">", "<"]:
            formatted_code = formatted_code.replace(f"{op} ", op + " ")
            formatted_code = formatted_code.replace(f" {op}", " " + op)

        formatted_code = re.sub(r"(\w+)\s+\(", r"\1(", formatted_code)

        return formatted_code
    except Exception as e:
        print(f"Error formatting code: {str(e)}")
        return code


def fix_common_syntax_issues(code):
    """
    Fix common syntax issues in generated code without modifying indentation.
    """
    lines = code.split("\n")
    fixed_lines = []

    for line in lines:
        stripped = line.strip()
        if (
            stripped.startswith("if ")
            or stripped.startswith("elif ")
            or stripped.startswith("else")
            or stripped.startswith("for ")
            or stripped.startswith("while ")
            or stripped.startswith("def ")
            or stripped.startswith("class ")
        ):
            if not stripped.endswith(":") and not stripped.endswith("\\"):
                line = line.rstrip() + ":"

        fixed_lines.append(line)

    code = "\n".join(fixed_lines)

    # Fix mismatched quotes
    quote_chars = ['"', "'"]
    for quote in quote_chars:
        if code.count(quote) % 2 != 0:
            lines = code.split("\n")
            for i, line in enumerate(lines):
                if line.count(quote) % 2 != 0:
                    lines[i] = line.rstrip() + quote
                    break
            code = "\n".join(lines)

    # Fix missing parentheses in function calls
    pattern = r"(\w+)\s*\([^)]*$"
    if re.search(pattern, code):
        lines = code.split("\n")
        for i, line in enumerate(lines):
            if re.search(pattern, line) and not any(
                lines[j].strip().startswith(")")
                for j in range(i + 1, min(i + 3, len(lines)))
            ):
                lines[i] = line.rstrip() + ")"
        code = "\n".join(lines)

    return code


def load_example_from_file(example_path):
    """
    Load example from a file with format:
    description_BREAK_code
    where 'code' uses \\n and \\t for formatting.
    """
    try:
        with open(example_path, "r") as f:
            content = f.read()

        parts = content.split("_BREAK_")
        if len(parts) == 2:
            description = parts[0].strip()
            code = parts[1].strip()

            code = code.replace("\\n", "\n").replace("\\t", "\t")
            code = normalize_indentation(code)

            return description, code
        else:
            print(f"Invalid format in example file: {example_path}")
            return "", ""
    except Exception as e:
        print(f"Error loading example file {example_path}: {str(e)}")
        return "", ""


def find_example_files():
    """
    Find all raw.in example files in the examples directory.
    """
    example_files = glob.glob("examples/*/raw.in")
    return example_files


# ==========================
#  Load model from HF Hub
# ==========================

BASE_MODEL_ID = "Salesforce/codet5p-770m"
FINETUNED_REPO_ID = "OSS-forge/codet5p-770m-pyresbugs"  
FINETUNED_FILENAME = "pytorch_model.bin"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Loading tokenizer from base model: {BASE_MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

print(f"Loading base model: {BASE_MODEL_ID}")
model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_ID)
model.to(device)

print(f"Downloading fine-tuned weights from repo: {FINETUNED_REPO_ID}")
ckpt_path = hf_hub_download(FINETUNED_REPO_ID, FINETUNED_FILENAME)

print(f"Loading state_dict from: {ckpt_path}")
state_dict = torch.load(ckpt_path, map_location="cpu")

if "model_state_dict" in state_dict:
    state_dict = state_dict["model_state_dict"]

missing, unexpected = model.load_state_dict(state_dict, strict=False)
print(f"Loaded fine-tuned weights. Missing keys: {len(missing)}, unexpected keys: {len(unexpected)}")

model.eval()




# ==========================
#  Gradio logic
# ==========================

# State variables
current_code = None
bug_counter = 0


def generate_bugged_code(description, code, chat_history, is_first_time):
    global current_code, bug_counter

    if chat_history is None:
        chat_history = []

    if is_first_time:
        bug_counter = 0
        current_code = None
        chat_history = []

    bug_counter += 1

    if bug_counter == 1:
        input_for_model = code
        input_type = "original"
    else:
        if current_code is None:
            return chat_history, gr.update(value=""), False
        input_for_model = current_code
        input_type = "previous bugged code"

    print(f"Using {input_type} - counter: {bug_counter}\n{input_for_model}")

    encoded_code = encode_text(input_for_model)
    combined_input = f"Description: {description} _BREAK_ Code: {encoded_code}"

    inputs = tokenizer(
        combined_input,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).input_ids.to(device)

    try:
        print("Starting generation...")
        with torch.no_grad():
            outputs = model.generate(
                inputs,
                max_new_tokens=256,
                num_beams=1,
                do_sample=False,
                early_stopping=True,
            )
        print("Generation done.")
    except Exception as e:
        print("Generation error:", repr(e))
        raise e

    bugged_code_escaped = tokenizer.decode(outputs[0], skip_special_tokens=True)

    bugged_code = clear_text(bugged_code_escaped)
    bugged_code = fix_common_syntax_issues(bugged_code)
    bugged_code = format_code(bugged_code)

    current_code = bugged_code

    user_message = f"**Description**: {description}"
    if input_type == "original":
        user_message += f"\n\n**Original code**:\n```python\n{input_for_model}\n```"
    else:
        user_message += (
            f"\n\n**Previous bugged code**:\n```python\n{input_for_model}\n```"
        )

    ai_message = f"**Bugged code**:\n```python\n{bugged_code}\n```"

    chat_history = chat_history + [
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": ai_message},
    ]

    return chat_history, gr.update(value=""), False




def reset_interface():
    global current_code, bug_counter
    current_code = None
    bug_counter = 0
    return [], gr.update(value=""), True


example_files = find_example_files()
example_names = [
    f"Example {i+1}: {os.path.basename(os.path.dirname(f))}"
    for i, f in enumerate(example_files)
]


def load_example(example_index):
    if example_index < len(example_files):
        return load_example_from_file(example_files[example_index])
    return "", ""


with gr.Blocks(title="Software-Fault Injection from NL") as demo:
    gr.Markdown("# 🐞 Software-Fault Injection from Natural Language")
    gr.Markdown(
        "Generate Python code with specific bugs based on a description and original code. "
        "The model used is **BugGen (CodeT5+ 770M, PyResBugs)**."
    )

    with gr.Row():
        with gr.Column(scale=2):
            description_input = gr.Textbox(
                label="Bug Description",
                placeholder="Describe the type of bug to introduce...",
                lines=3,
            )
            code_input = gr.Code(
                label="Original Code",
                language="python",
                lines=12,
            )

            is_first = gr.State(True)

            submit_btn = gr.Button("Generate Bugged Code")
            reset_btn = gr.Button("Start Over")

            gr.Markdown("### Examples")
            example_buttons = [gr.Button(name) for name in example_names]

        with gr.Column(scale=3):
            chat_output = gr.Chatbot(
                label="Conversation",
                height=500,
            )

    for i, btn in enumerate(example_buttons):
        btn.click(
            fn=lambda i=i: load_example(i),
            outputs=[description_input, code_input],
        )

    submit_btn.click(
        fn=generate_bugged_code,
        inputs=[description_input, code_input, chat_output, is_first],
        outputs=[chat_output, description_input, is_first],
    )

    reset_btn.click(
        fn=reset_interface,
        outputs=[chat_output, description_input, is_first],
    )

print("Launching Gradio interface...")
demo.queue(max_size=10).launch()