Spaces:

Aranwer
/

CodeCloneDetector

Runtime error

App Files Files Community

Aranwer commited on Apr 26, 2025

Commit

daf072c

verified ·

1 Parent(s): 84d4d13

Update app.py

Browse files

Files changed (1) hide show

app.py +92 -139

app.py CHANGED Viewed

@@ -7,12 +7,9 @@ import torch.nn.functional as F
 import re
 import numpy as np
 import networkx as nx
-from transformers import AutoTokenizer, AutoModel, AutoConfig
-from torch_geometric.data import Data
-from torch_geometric.nn import GCNConv
 import warnings
 import pandas as pd
-import zipfile
 from collections import defaultdict
 # Configuration
@@ -23,137 +20,66 @@ warnings.filterwarnings("ignore")
 MODEL_NAME = "microsoft/codebert-base"
 MAX_LENGTH = 512
 DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-DATASET_PATH = "ijadataset2-1.zip"
-CACHE_DIR = "./model_cache"
 # Set up page config
 st.set_page_config(
-    page_title="Advanced Java Code Clone Detector",
     page_icon="🔍",
     layout="wide"
 )
-# Model Definitions
-class RNNModel(nn.Module):
-    def __init__(self, input_size, hidden_size, num_layers):
         super().__init__()
-        self.hidden_size = hidden_size
-        self.num_layers = num_layers
-        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
         self.fc = nn.Linear(hidden_size, 1)
     def forward(self, x):
-        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(DEVICE)
-        out, _ = self.rnn(x, h0)
-        return self.fc(out[:, -1, :])
-class GNNModel(nn.Module):
-    def __init__(self, node_features):
-        super().__init__()
-        self.conv1 = GCNConv(node_features, 128)
-        self.conv2 = GCNConv(128, 64)
-        self.fc = nn.Linear(64, 1)
-    def forward(self, data):
-        x, edge_index = data.x, data.edge_index
-        x = F.relu(self.conv1(x, edge_index))
-        x = F.dropout(x, training=self.training)
-        x = self.conv2(x, edge_index)
-        return torch.sigmoid(self.fc(x).mean())
-# Model Loading with Cache
 @st.cache_resource(show_spinner=False)
 def load_models():
     try:
         with st.spinner('Loading models (first run may take a few minutes)...'):
-            config = AutoConfig.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
-            tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
-            model = AutoModel.from_pretrained(MODEL_NAME, config=config, cache_dir=CACHE_DIR).to(DEVICE)
-            rnn_model = RNNModel(input_size=768, hidden_size=256, num_layers=2).to(DEVICE)
-            gnn_model = GNNModel(node_features=128).to(DEVICE)
-            return tokenizer, model, rnn_model, gnn_model
     except Exception as e:
         st.error(f"Model loading failed: {str(e)}")
-        return None, None, None, None
-# Dataset Loading
-@st.cache_resource
-def load_dataset():
-    try:
-        if not os.path.exists("Diverse_100K_Dataset"):
-            with zipfile.ZipFile(DATASET_PATH, 'r') as zip_ref:
-                zip_ref.extractall(".")
-        clone_pairs = []
-        base_path = "Diverse_100K_Dataset/Subject_CloneTypes_Directories"
-        for clone_type in ["Clone_Type1", "Clone_Type2", "Clone_Type3 - ST", "Clone_Type4"]:
-            type_path = os.path.join(base_path, clone_type)
-            if os.path.exists(type_path):
-                for root, _, files in os.walk(type_path):
-                    if files and len(files) >= 2:
-                        with open(os.path.join(root, files[0]), 'r', encoding='utf-8') as f1, \
-                             open(os.path.join(root, files[1]), 'r', encoding='utf-8') as f2:
-                            clone_pairs.append({
-                                "type": clone_type,
-                                "code1": f1.read(),
-                                "code2": f2.read()
-                            })
-                        break
-        return clone_pairs[:10]
-    except Exception as e:
-        st.error(f"Dataset error: {str(e)}")
-        return []
-# AST Processing
 def parse_ast(code):
     try:
         return javalang.parse.parse(code)
     except:
         return None
-def build_ast_graph(ast_tree):
-    if not ast_tree: return None
-    G = nx.DiGraph()
-    node_id = 0
-    def traverse(node, parent=None):
-        nonlocal node_id
-        current = node_id
-        G.add_node(current, type=type(node).__name__)
-        if parent is not None:
-            G.add_edge(parent, current)
-        node_id += 1
         for child in getattr(node, 'children', []):
             if isinstance(child, javalang.ast.Node):
-                traverse(child, current)
             elif isinstance(child, (list, tuple)):
                 for item in child:
                     if isinstance(item, javalang.ast.Node):
-                        traverse(item, current)
     traverse(ast_tree)
-    return G
-def ast_to_pyg_data(ast_graph):
-    if not ast_graph: return None
-    node_types = list(nx.get_node_attributes(ast_graph, 'type').values())
-    unique_types = list(set(node_types))
-    type_to_idx = {t: i for i, t in enumerate(unique_types)}
-    x = torch.zeros(len(node_types), len(unique_types))
-    for i, t in enumerate(node_types):
-        x[i, type_to_idx[t]] = 1
-    edge_index = torch.tensor(list(ast_graph.edges())).t().contiguous()
-    return Data(x=x.to(DEVICE), edge_index=edge_index.to(DEVICE))
 # Feature Extraction
 def normalize_code(code):
@@ -176,66 +102,88 @@ def get_embedding(code, tokenizer, model):
     except:
         return None
-# Similarity Calculations
 def calculate_similarities(code1, code2, models):
-    tokenizer, code_model, rnn_model, gnn_model = models
     # Get embeddings
     emb1 = get_embedding(code1, tokenizer, code_model)
     emb2 = get_embedding(code2, tokenizer, code_model)
-    # Parse ASTs
-    ast1 = build_ast_graph(parse_ast(code1))
-    ast2 = build_ast_graph(parse_ast(code2))
     # Calculate similarities
-    codebert_sim = F.cosine_similarity(emb1, emb2).item() if emb1 is not None and emb2 is not None else 0
     rnn_sim = 0
     if emb1 is not None and emb2 is not None:
         with torch.no_grad():
-            rnn_input = torch.stack([emb1.squeeze(), emb2.squeeze()])
-            rnn_sim = torch.sigmoid(rnn_model(rnn_input.unsqueeze(0))).item()
-    gnn_sim = 0
-    if ast1 and ast2:
-        data1 = ast_to_pyg_data(ast1)
-        data2 = ast_to_pyg_data(ast2)
-        if data1 and data2:
-            with torch.no_grad():
-                gnn_sim = F.cosine_similarity(
-                    gnn_model(data1).unsqueeze(0),
-                    gnn_model(data2).unsqueeze(0)
-                ).item()
     return {
         'codebert': codebert_sim,
         'rnn': rnn_sim,
-        'gnn': gnn_sim,
-        'combined': 0.4*codebert_sim + 0.3*rnn_sim + 0.3*gnn_sim
     }
-# UI Components
 def main():
-    st.title("🔍 Advanced Java Code Clone Detector")
-    st.markdown("Detect all clone types (1-4) using hybrid analysis")
-    # Load resources
     models = load_models()
-    dataset_pairs = load_dataset()
     # Code input
-    selected_pair = None
-    if dataset_pairs:
-        pair_options = {f"{i+1}: {pair['type']}": pair for i, pair in enumerate(dataset_pairs)}
-        selected_option = st.selectbox("Select example pair:", list(pair_options.keys()))
-        selected_pair = pair_options[selected_option]
     col1, col2 = st.columns(2)
     with col1:
-        code1 = st.text_area("Code 1", height=300, value=selected_pair["code1"] if selected_pair else "")
     with col2:
-        code2 = st.text_area("Code 2", height=300, value=selected_pair["code2"] if selected_pair else "")
     # Thresholds
     st.subheader("Detection Thresholds")
@@ -247,33 +195,38 @@ def main():
     with cols[2]:
         t4 = st.slider("Type 4", 0.5, 0.8, 0.65)
-    # Analysis
-    if st.button("Analyze", type="primary") and models[0]:
-        with st.spinner("Analyzing..."):
             sims = calculate_similarities(code1, code2, models)
             # Determine clone type
             clone_type = "No Clone"
             if sims['combined'] >= t1:
-                clone_type = "Type 1/2 Clone"
             elif sims['combined'] >= t3:
-                clone_type = "Type 3 Clone"
             elif sims['combined'] >= t4:
-                clone_type = "Type 4 Clone"
             # Display results
             st.subheader("Results")
             cols = st.columns(4)
             cols[0].metric("Combined", f"{sims['combined']:.2f}")
             cols[1].metric("CodeBERT", f"{sims['codebert']:.2f}")
             cols[2].metric("RNN", f"{sims['rnn']:.2f}")
-            cols[3].metric("GNN", f"{sims['gnn']:.2f}")
             st.progress(sims['combined'])
             st.metric("Detection Result", clone_type)
             # Show details
-            with st.expander("Details"):
                 st.json(sims)
                 st.code(f"Normalized Code 1:\n{normalize_code(code1)}")
                 st.code(f"Normalized Code 2:\n{normalize_code(code2)}")

 import re
 import numpy as np
 import networkx as nx
+from transformers import AutoTokenizer, AutoModel
 import warnings
 import pandas as pd
 from collections import defaultdict
 # Configuration
 MODEL_NAME = "microsoft/codebert-base"
 MAX_LENGTH = 512
 DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 # Set up page config
 st.set_page_config(
+    page_title="Java Code Clone Detector",
     page_icon="🔍",
     layout="wide"
 )
+# Simplified RNN Model (for Hugging Face compatibility)
+class SimpleRNN(nn.Module):
+    def __init__(self, input_size=768, hidden_size=128):
         super().__init__()
+        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
         self.fc = nn.Linear(hidden_size, 1)
     def forward(self, x):
+        out, _ = self.rnn(x)
+        return torch.sigmoid(self.fc(out[:, -1]))
+# Model Loading with caching
 @st.cache_resource(show_spinner=False)
 def load_models():
     try:
         with st.spinner('Loading models (first run may take a few minutes)...'):
+            # Load CodeBERT
+            tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+            code_model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
+            # Initialize simple RNN
+            rnn_model = SimpleRNN().to(DEVICE)
+            return tokenizer, code_model, rnn_model
     except Exception as e:
         st.error(f"Model loading failed: {str(e)}")
+        return None, None, None
+# AST Processing (simplified for Hugging Face)
 def parse_ast(code):
     try:
         return javalang.parse.parse(code)
     except:
         return None
+def build_simple_ast_features(ast_tree):
+    if not ast_tree: return {}
+    features = defaultdict(int)
+    def traverse(node):
+        features[type(node).__name__] += 1
         for child in getattr(node, 'children', []):
             if isinstance(child, javalang.ast.Node):
+                traverse(child)
             elif isinstance(child, (list, tuple)):
                 for item in child:
                     if isinstance(item, javalang.ast.Node):
+                        traverse(item)
     traverse(ast_tree)
+    return dict(features)
 # Feature Extraction
 def normalize_code(code):
     except:
         return None
+# Similarity Calculations (optimized for Hugging Face)
 def calculate_similarities(code1, code2, models):
+    tokenizer, code_model, rnn_model = models
     # Get embeddings
     emb1 = get_embedding(code1, tokenizer, code_model)
     emb2 = get_embedding(code2, tokenizer, code_model)
+    # Get AST features
+    ast1 = parse_ast(code1)
+    ast2 = parse_ast(code2)
+    ast_features1 = build_simple_ast_features(ast1)
+    ast_features2 = build_simple_ast_features(ast2)
     # Calculate similarities
+    codebert_sim = 0
+    if emb1 is not None and emb2 is not None:
+        codebert_sim = F.cosine_similarity(emb1, emb2).item()
     rnn_sim = 0
     if emb1 is not None and emb2 is not None:
         with torch.no_grad():
+            rnn_input = torch.cat([emb1, emb2]).unsqueeze(0)
+            rnn_sim = rnn_model(rnn_input).item()
+    # Simple AST similarity (count matching node types)
+    ast_sim = 0
+    if ast_features1 and ast_features2:
+        common_keys = set(ast_features1.keys()) & set(ast_features2.keys())
+        total_keys = set(ast_features1.keys()) | set(ast_features2.keys())
+        ast_sim = len(common_keys) / len(total_keys) if total_keys else 0
     return {
         'codebert': codebert_sim,
         'rnn': rnn_sim,
+        'ast': ast_sim,
+        'combined': 0.5*codebert_sim + 0.3*rnn_sim + 0.2*ast_sim
     }
+# Main UI
 def main():
+    st.title("🔍 Java Code Clone Detector (IJaDataset 2.1)")
+    st.markdown("Detect Type 1-4 clones using hybrid analysis")
+    # Load models
     models = load_models()
+    if None in models:
+        st.error("Failed to load required models. Please check the logs.")
+        return
+    # Example code pairs
+    example_pairs = {
+        "Type 1 Example": {
+            "code1": "public class Test { public static void main(String[] args) { System.out.println(\"Hello\"); } }",
+            "code2": "public class Test { public static void main(String[] args) { System.out.println(\"Hello\"); } }"
+        },
+        "Type 2 Example": {
+            "code1": "public class Test { public static void main(String[] args) { System.out.println(\"Hello\"); } }",
+            "code2": "public class Example { public static void main(String[] args) { System.out.println(\"Hello\"); } }"
+        },
+        "Type 3 Example": {
+            "code1": "public class Test { public static void main(String[] args) { for(int i=0;i<10;i++) System.out.println(i); } }",
+            "code2": "public class Example { public static void run(String[] params) { for(int j=0;j<10;j++) System.out.println(j); } }"
+        }
+    }
     # Code input
+    selected_example = st.selectbox("Select example pair:", list(example_pairs.keys()))
     col1, col2 = st.columns(2)
     with col1:
+        code1 = st.text_area(
+            "Code 1",
+            height=300,
+            value=example_pairs[selected_example]["code1"]
+        )
     with col2:
+        code2 = st.text_area(
+            "Code 2",
+            height=300,
+            value=example_pairs[selected_example]["code2"]
+        )
     # Thresholds
     st.subheader("Detection Thresholds")
     with cols[2]:
         t4 = st.slider("Type 4", 0.5, 0.8, 0.65)
+    # Analysis button
+    if st.button("Analyze Code", type="primary"):
+        with st.spinner("Analyzing code..."):
             sims = calculate_similarities(code1, code2, models)
             # Determine clone type
             clone_type = "No Clone"
             if sims['combined'] >= t1:
+                clone_type = "Type 1/2 Clone (Exact/Near-Exact)"
             elif sims['combined'] >= t3:
+                clone_type = "Type 3 Clone (Near-Miss)"
             elif sims['combined'] >= t4:
+                clone_type = "Type 4 Clone (Semantic)"
             # Display results
             st.subheader("Results")
+            # Metrics
             cols = st.columns(4)
             cols[0].metric("Combined", f"{sims['combined']:.2f}")
             cols[1].metric("CodeBERT", f"{sims['codebert']:.2f}")
             cols[2].metric("RNN", f"{sims['rnn']:.2f}")
+            cols[3].metric("AST", f"{sims['ast']:.2f}")
+            # Progress bar
             st.progress(sims['combined'])
+            # Final result
             st.metric("Detection Result", clone_type)
             # Show details
+            with st.expander("Advanced Details"):
                 st.json(sims)
                 st.code(f"Normalized Code 1:\n{normalize_code(code1)}")
                 st.code(f"Normalized Code 2:\n{normalize_code(code2)}")