Spaces:

Aranwer
/

CodeCloneDetector

Runtime error

App Files Files Community

Aranwer committed on Apr 26, 2025

Commit

84d4d13

verified ·

1 Parent(s): f2bf3d8

Update app.py

Browse files

Files changed (1) hide show

app.py +175 -413

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import streamlit as st
 import javalang
 import torch
@@ -6,241 +7,164 @@ import torch.nn.functional as F
 import re
 import numpy as np
 import networkx as nx
-from transformers import AutoTokenizer, AutoModel
 from torch_geometric.data import Data
 from torch_geometric.nn import GCNConv
 import warnings
 import pandas as pd
 import zipfile
-import os
 from collections import defaultdict
 # Set up page config
 st.set_page_config(
-    page_title="Advanced Java Code Clone Detector (IJaDataset 2.1)",
     page_icon="🔍",
     layout="wide"
 )
-# Suppress warnings
-warnings.filterwarnings("ignore")
-# Constants
-MODEL_NAME = "microsoft/codebert-base"
-MAX_LENGTH = 512
-DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-DATASET_PATH = "archive (1).zip"  # Update this path if needed
-# Initialize models with caching
-@st.cache_resource
 def load_models():
     try:
-        # Load CodeBERT for semantic analysis
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-        code_model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
-        # Initialize RNN model
-        class RNNModel(nn.Module):
-            def __init__(self, input_size, hidden_size, num_layers):
-                super(RNNModel, self).__init__()
-                self.hidden_size = hidden_size
-                self.num_layers = num_layers
-                self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
-                self.fc = nn.Linear(hidden_size, 1)
-            def forward(self, x):
-                h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(DEVICE)
-                out, _ = self.rnn(x, h0)
-                out = self.fc(out[:, -1, :])
-                return out
-        rnn_model = RNNModel(input_size=768, hidden_size=256, num_layers=2).to(DEVICE)
-        # Initialize GNN model
-        class GNNModel(nn.Module):
-            def __init__(self, node_features):
-                super(GNNModel, self).__init__()
-                self.conv1 = GCNConv(node_features, 128)
-                self.conv2 = GCNConv(128, 64)
-                self.fc = nn.Linear(64, 1)
-            def forward(self, data):
-                x, edge_index = data.x, data.edge_index
-                x = F.relu(self.conv1(x, edge_index))
-                x = F.dropout(x, training=self.training)
-                x = self.conv2(x, edge_index)
-                x = self.fc(x)
-                return torch.sigmoid(x.mean())
-        gnn_model = GNNModel(node_features=128).to(DEVICE)
-        return tokenizer, code_model, rnn_model, gnn_model
     except Exception as e:
-        st.error(f"Failed to load models: {str(e)}")
         return None, None, None, None
 @st.cache_resource
 def load_dataset():
     try:
-        # Extract dataset if needed
         if not os.path.exists("Diverse_100K_Dataset"):
             with zipfile.ZipFile(DATASET_PATH, 'r') as zip_ref:
                 zip_ref.extractall(".")
-        # Load sample pairs (modify this based on your dataset structure)
         clone_pairs = []
-        base_path = "Subject_CloneTypes_Directories"
-        # Load pairs from all clone types
         for clone_type in ["Clone_Type1", "Clone_Type2", "Clone_Type3 - ST", "Clone_Type4"]:
             type_path = os.path.join(base_path, clone_type)
             if os.path.exists(type_path):
                 for root, _, files in os.walk(type_path):
-                    if files:
-                        # Take first two files as a pair
-                        if len(files) >= 2:
-                            with open(os.path.join(root, files[0]), 'r', encoding='utf-8') as f1:
-                                code1 = f1.read()
-                            with open(os.path.join(root, files[1]), 'r', encoding='utf-8') as f2:
-                                code2 = f2.read()
                             clone_pairs.append({
                                 "type": clone_type,
-                                "code1": code1,
-                                "code2": code2
                             })
-                        break  # Just take one pair per type for demo
-        return clone_pairs[:10]  # Return first 10 pairs for demo
     except Exception as e:
-        st.error(f"Error loading dataset: {str(e)}")
         return []
-tokenizer, code_model, rnn_model, gnn_model = load_models()
-dataset_pairs = load_dataset()
-# AST Processing Functions
 def parse_ast(code):
     try:
-        tokens = javalang.tokenizer.tokenize(code)
-        parser = javalang.parser.Parser(tokens)
-        tree = parser.parse()
-        return tree
-    except Exception as e:
-        st.warning(f"AST parsing error: {str(e)}")
         return None
 def build_ast_graph(ast_tree):
-    if not ast_tree:
-        return None
     G = nx.DiGraph()
     node_id = 0
-    node_map = {}
-    def traverse(node, parent_id=None):
         nonlocal node_id
-        current_id = node_id
-        node_label = str(type(node).__name__)
-        node_map[current_id] = {'type': node_label, 'node': node}
-        G.add_node(current_id, type=node_label)
-        if parent_id is not None:
-            G.add_edge(parent_id, current_id)
         node_id += 1
-        for child in node.children:
             if isinstance(child, javalang.ast.Node):
-                traverse(child, current_id)
             elif isinstance(child, (list, tuple)):
                 for item in child:
                     if isinstance(item, javalang.ast.Node):
-                        traverse(item, current_id)
     traverse(ast_tree)
-    return G, node_map
 def ast_to_pyg_data(ast_graph):
-    if not ast_graph:
-        return None
-    # Convert AST to PyTorch Geometric Data format
-    node_features = []
-    node_types = []
-    for node in ast_graph.nodes():
-        node_type = ast_graph.nodes[node]['type']
-        node_types.append(node_type)
-        # Simple one-hot encoding of node types (in practice, use better encoding)
-        feature = [0] * 50  # Assuming max 50 node types
-        feature[hash(node_type) % 50] = 1
-        node_features.append(feature)
-    # Convert networkx graph to edge_index format
-    edge_index = list(ast_graph.edges())
-    if not edge_index:
-        # Add self-loop if no edges
-        edge_index = [(0, 0)]
-    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
-    x = torch.tensor(node_features, dtype=torch.float)
-    return Data(x=x, edge_index=edge_index)
-# Normalization function
 def normalize_code(code):
-    try:
-        code = re.sub(r'//.*', '', code)  # Remove single-line comments
-        code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)  # Multi-line comments
-        code = re.sub(r'\s+', ' ', code).strip()  # Normalize whitespace
-        return code
-    except Exception:
-        return code
-# Feature extraction functions
-def get_lexical_features(code):
-    """Extract lexical features (for Type-1 and Type-2 clones)"""
-    normalized = normalize_code(code)
-    tokens = re.findall(r'\b\w+\b', normalized)
-    return {
-        'token_count': len(tokens),
-        'unique_tokens': len(set(tokens)),
-        'avg_token_length': np.mean([len(t) for t in tokens]) if tokens else 0
-    }
-def get_syntactic_features(ast_tree):
-    """Extract syntactic features (for Type-3 clones)"""
-    if not ast_tree:
-        return {}
-    # Count different node types in AST
-    node_counts = defaultdict(int)
-    def count_nodes(node):
-        node_counts[type(node).__name__] += 1
-        for child in node.children:
-            if isinstance(child, javalang.ast.Node):
-                count_nodes(child)
-            elif isinstance(child, (list, tuple)):
-                for item in child:
-                    if isinstance(item, javalang.ast.Node):
-                        count_nodes(item)
-    count_nodes(ast_tree)
-    return dict(node_counts)
-def get_semantic_features(code):
-    """Extract semantic features (for Type-4 clones)"""
-    embedding = get_embedding(code)
-    return embedding.cpu().numpy().flatten() if embedding is not None else None
-# Embedding generation
-def get_embedding(code):
     try:
-        code = normalize_code(code)
         inputs = tokenizer(
-            code,
             return_tensors="pt",
             truncation=True,
             max_length=MAX_LENGTH,
@@ -248,273 +172,111 @@ def get_embedding(code):
         ).to(DEVICE)
         with torch.no_grad():
-            outputs = code_model(**inputs)
-        return outputs.last_hidden_state.mean(dim=1)  # Pooled embedding
-    except Exception as e:
-        st.error(f"Error processing code: {str(e)}")
-        return None
-# Clone detection models
-def rnn_similarity(emb1, emb2):
-    """Calculate similarity using RNN model"""
-    if emb1 is None or emb2 is None:
         return None
-    # Prepare input for RNN (sequence of embeddings)
-    combined = torch.cat([emb1.unsqueeze(0), emb2.unsqueeze(0)], dim=0)
-    with torch.no_grad():
-        similarity = rnn_model(combined.permute(1, 0, 2))
-    return torch.sigmoid(similarity).item()
-def gnn_similarity(ast1, ast2):
-    """Calculate similarity using GNN model"""
-    if ast1 is None or ast2 is None:
-        return None
-    data1 = ast_to_pyg_data(ast1)
-    data2 = ast_to_pyg_data(ast2)
-    if data1 is None or data2 is None:
-        return None
-    # Move data to device
-    data1 = data1.to(DEVICE)
-    data2 = data2.to(DEVICE)
-    with torch.no_grad():
-        sim1 = gnn_model(data1)
-        sim2 = gnn_model(data2)
-    return F.cosine_similarity(sim1, sim2).item()
-def hybrid_similarity(code1, code2):
-    """Combined similarity score using all models"""
     # Get embeddings
-    emb1 = get_embedding(code1)
-    emb2 = get_embedding(code2)
     # Parse ASTs
-    ast_tree1 = parse_ast(code1)
-    ast_tree2 = parse_ast(code2)
-    ast_graph1 = build_ast_graph(ast_tree1) if ast_tree1 else None
-    ast_graph2 = build_ast_graph(ast_tree2) if ast_tree2 else None
-    # Calculate individual similarities
     codebert_sim = F.cosine_similarity(emb1, emb2).item() if emb1 is not None and emb2 is not None else 0
-    rnn_sim = rnn_similarity(emb1, emb2) if emb1 is not None and emb2 is not None else 0
-    gnn_sim = gnn_similarity(ast_graph1[0] if ast_graph1 else None,
-                            ast_graph2[0] if ast_graph2 else None) or 0
-    # Combine with weights (can be tuned)
-    weights = {
-        'codebert': 0.4,
-        'rnn': 0.3,
-        'gnn': 0.3
-    }
-    combined = (weights['codebert'] * codebert_sim +
-               weights['rnn'] * rnn_sim +
-               weights['gnn'] * gnn_sim)
     return {
-        'combined': combined,
         'codebert': codebert_sim,
         'rnn': rnn_sim,
-        'gnn': gnn_sim
     }
-# Comparison function
-def compare_code(code1, code2):
-    if not code1 or not code2:
-        return None
-    with st.spinner('Analyzing code with multiple techniques...'):
-        # Get lexical features
-        lex1 = get_lexical_features(code1)
-        lex2 = get_lexical_features(code2)
-        # Get AST trees
-        ast_tree1 = parse_ast(code1)
-        ast_tree2 = parse_ast(code2)
-        # Get syntactic features
-        syn1 = get_syntactic_features(ast_tree1)
-        syn2 = get_syntactic_features(ast_tree2)
-        # Get semantic features
-        sem1 = get_semantic_features(code1)
-        sem2 = get_semantic_features(code2)
-        # Calculate hybrid similarity
-        similarities = hybrid_similarity(code1, code2)
-        return {
-            'similarities': similarities,
-            'lexical_features': (lex1, lex2),
-            'syntactic_features': (syn1, syn2),
-            'ast_trees': (ast_tree1, ast_tree2)
-        }
-# UI Elements
-st.title("🔍 Advanced Java Code Clone Detector (IJaDataset 2.1)")
-st.markdown("""
-Detect all types of code clones (Type 1-4) using hybrid approach with:
-- **CodeBERT** for semantic analysis
-- **RNN** for sequence modeling
-- **GNN** for AST structural analysis
-""")
-# Dataset selector
-selected_pair = None
-if dataset_pairs:
-    pair_options = {f"{i+1}: {pair['type']}": pair for i, pair in enumerate(dataset_pairs)}
-    selected_option = st.selectbox("Select a preloaded example pair:", list(pair_options.keys()))
-    selected_pair = pair_options[selected_option]
-# Layout
-col1, col2 = st.columns(2)
-with col1:
-    code1 = st.text_area(
-        "First Java Code",
-        height=300,
-        value=selected_pair["code1"] if selected_pair else "",
-        help="Enter the first Java code snippet"
-    )
-with col2:
-    code2 = st.text_area(
-        "Second Java Code",
-        height=300,
-        value=selected_pair["code2"] if selected_pair else "",
-        help="Enter the second Java code snippet"
-    )
-# Threshold sliders
-st.subheader("Detection Thresholds")
-col1, col2, col3 = st.columns(3)
-with col1:
-    threshold_type12 = st.slider(
-        "Type 1/2 Threshold",
-        min_value=0.5,
-        max_value=1.0,
-        value=0.9,
-        step=0.01,
-        help="Threshold for exact/syntactic clones"
-    )
-with col2:
-    threshold_type3 = st.slider(
-        "Type 3 Threshold",
-        min_value=0.5,
-        max_value=1.0,
-        value=0.8,
-        step=0.01,
-        help="Threshold for near-miss clones"
-    )
-with col3:
-    threshold_type4 = st.slider(
-        "Type 4 Threshold",
-        min_value=0.5,
-        max_value=1.0,
-        value=0.7,
-        step=0.01,
-        help="Threshold for semantic clones"
-    )
-# Compare button
-if st.button("Compare Code", type="primary"):
-    if tokenizer is None or code_model is None or rnn_model is None or gnn_model is None:
-        st.error("Models failed to load. Please check the logs.")
-    else:
-        result = compare_code(code1, code2)
-        if result is not None:
-            similarities = result['similarities']
-            lex1, lex2 = result['lexical_features']
-            syn1, syn2 = result['syntactic_features']
-            ast_tree1, ast_tree2 = result['ast_trees']
-            # Display results
-            st.subheader("Detection Results")
             # Determine clone type
-            combined_sim = similarities['combined']
             clone_type = "No Clone"
-            if combined_sim >= threshold_type12:
-                clone_type = "Type 1/2 Clone (Exact/Near-Exact)"
-            elif combined_sim >= threshold_type3:
-                clone_type = "Type 3 Clone (Near-Miss)"
-            elif combined_sim >= threshold_type4:
-                clone_type = "Type 4 Clone (Semantic)"
-            # Main metrics
-            col1, col2, col3 = st.columns(3)
-            with col1:
-                st.metric("Combined Similarity", f"{combined_sim:.3f}")
-            with col2:
-                st.metric("Detected Clone Type", clone_type)
-            with col3:
-                st.metric("CodeBERT Similarity", f"{similarities['codebert']:.3f}")
-            # Detailed metrics
-            with st.expander("Detailed Similarity Scores"):
-                cols = st.columns(3)
-                with cols[0]:
-                    st.metric("RNN Similarity", f"{similarities['rnn']:.3f}")
-                with cols[1]:
-                    st.metric("GNN Similarity", f"{similarities['gnn']:.3f}")
-                with cols[2]:
-                    st.metric("Lexical Similarity",
-                            f"{sum(lex1[k] == lex2[k] for k in lex1)/max(len(lex1),1):.2f}")
-            # Feature comparison
-            with st.expander("Feature Analysis"):
-                st.subheader("Lexical Features")
-                lex_df = pd.DataFrame([lex1, lex2], index=["Code 1", "Code 2"])
-                st.dataframe(lex_df)
-                st.subheader("Syntactic Features (AST Node Counts)")
-                syn_df = pd.DataFrame([syn1, syn2], index=["Code 1", "Code 2"]).fillna(0)
-                st.dataframe(syn_df)
-            # AST Visualization
-            if ast_tree1 and ast_tree2:
-                with st.expander("AST Visualization (First 20 nodes)"):
-                    st.write("AST visualization would be implemented here with graphviz")
-                    # In a real implementation, you would use graphviz to render the ASTs
-                    # st.graphviz_chart(ast_to_graphviz(ast_tree1))
-                    # st.graphviz_chart(ast_to_graphviz(ast_tree2))
-            # Normalized code view
-            with st.expander("Show normalized code"):
-                tab1, tab2 = st.tabs(["First Code", "Second Code"])
-                with tab1:
-                    st.code(normalize_code(code1))
-                with tab2:
-                    st.code(normalize_code(code2))
-# Footer
-st.markdown("---")
-st.markdown("""
-*Dataset Information*:
-- Using IJaDataset 2.1 from Kaggle
-- Contains 100K Java files with clone annotations
-- Clone types: Type-1, Type-2, Type-3, and Type-4 clones
-*Model Architecture*:
-- **CodeBERT**: Pre-trained model for semantic analysis
-- **RNN**: Processes token sequences for sequential patterns
-- **GNN**: Analyzes AST structure for syntactic patterns
-- **Hybrid Approach**: Combines all techniques for comprehensive detection
-""")

+import os
 import streamlit as st
 import javalang
 import torch
 import re
 import numpy as np
 import networkx as nx
+from transformers import AutoTokenizer, AutoModel, AutoConfig
 from torch_geometric.data import Data
 from torch_geometric.nn import GCNConv
 import warnings
 import pandas as pd
 import zipfile
 from collections import defaultdict
+# Configuration
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+warnings.filterwarnings("ignore")
+# Constants
+MODEL_NAME = "microsoft/codebert-base"
+MAX_LENGTH = 512
+DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+DATASET_PATH = "ijadataset2-1.zip"
+CACHE_DIR = "./model_cache"
 # Set up page config
 st.set_page_config(
+    page_title="Advanced Java Code Clone Detector",
     page_icon="🔍",
     layout="wide"
 )
+# Model Definitions
class RNNModel(nn.Module):
    """Plain multi-layer RNN that scores a sequence with a single logit.

    Args:
        input_size: Width of each element of the input sequence.
        hidden_size: Width of the recurrent hidden state.
        num_layers: Number of stacked recurrent layers.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        # Kept as attributes because callers may introspect them.
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one logit per batch row, computed from the last time step.

        Args:
            x: Tensor laid out batch-first, as required by ``batch_first=True``.

        Returns:
            Tensor with one column per batch row (the un-squashed score).
        """
        # nn.RNN starts from an all-zero hidden state when none is passed, and
        # allocates it on the input's device and dtype. Building it by hand on
        # a global device broke as soon as the module lived somewhere else.
        out, _ = self.rnn(x)
        return self.fc(out[:, -1, :])
class GNNModel(nn.Module):
    """Two graph-convolution layers followed by a linear head.

    The forward pass collapses the whole graph into one sigmoid-squashed
    scalar (the mean of the per-node head outputs).

    Args:
        node_features: Width of each node's input feature vector.
    """

    def __init__(self, node_features):
        super().__init__()
        self.conv1 = GCNConv(node_features, 128)
        self.conv2 = GCNConv(128, 64)
        self.fc = nn.Linear(64, 1)

    def forward(self, data):
        """Score a graph object exposing ``x`` and ``edge_index``."""
        features = data.x
        edges = data.edge_index
        hidden = self.conv1(features, edges)
        hidden = F.relu(hidden)
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, training=self.training)
        hidden = self.conv2(hidden, edges)
        per_node = self.fc(hidden)
        return torch.sigmoid(per_node.mean())
+# Model Loading with Cache
@st.cache_resource(show_spinner=False)
def _load_models_cached():
    """Build every model once per process; raise on failure.

    Raising (instead of returning a failure tuple) matters: the cache stores
    return values, so a returned failure would be replayed on every rerun
    even after the underlying problem (e.g. a network hiccup) went away.
    """
    with st.spinner('Loading models (first run may take a few minutes)...'):
        config = AutoConfig.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
        model = AutoModel.from_pretrained(MODEL_NAME, config=config, cache_dir=CACHE_DIR).to(DEVICE)
        # NOTE: these two heads are freshly initialised here; no trained
        # weights are loaded for them anywhere in this function.
        rnn_model = RNNModel(input_size=768, hidden_size=256, num_layers=2).to(DEVICE)
        gnn_model = GNNModel(node_features=128).to(DEVICE)
        # Inference only: switch off dropout so scores are repeatable.
        for module in (model, rnn_model, gnn_model):
            module.eval()
        return tokenizer, model, rnn_model, gnn_model


def load_models():
    """Return ``(tokenizer, code_model, rnn_model, gnn_model)``.

    Returns:
        The four objects on success. On any loading error an error message is
        shown in the UI and ``(None, None, None, None)`` is returned; the
        failure is not cached, so the next rerun tries again.
    """
    try:
        return _load_models_cached()
    except Exception as e:
        st.error(f"Model loading failed: {str(e)}")
        return None, None, None, None
+# Dataset Loading
 @st.cache_resource
 def load_dataset():
     try:
         if not os.path.exists("Diverse_100K_Dataset"):
             with zipfile.ZipFile(DATASET_PATH, 'r') as zip_ref:
                 zip_ref.extractall(".")
         clone_pairs = []
+        base_path = "Diverse_100K_Dataset/Subject_CloneTypes_Directories"
         for clone_type in ["Clone_Type1", "Clone_Type2", "Clone_Type3 - ST", "Clone_Type4"]:
             type_path = os.path.join(base_path, clone_type)
             if os.path.exists(type_path):
                 for root, _, files in os.walk(type_path):
+                    if files and len(files) >= 2:
+                        with open(os.path.join(root, files[0]), 'r', encoding='utf-8') as f1, \
+                             open(os.path.join(root, files[1]), 'r', encoding='utf-8') as f2:
                             clone_pairs.append({
                                 "type": clone_type,
+                                "code1": f1.read(),
+                                "code2": f2.read()
                             })
+                        break
+        return clone_pairs[:10]
     except Exception as e:
+        st.error(f"Dataset error: {str(e)}")
         return []
+# AST Processing
 def parse_ast(code):
     try:
+        return javalang.parse.parse(code)
+    except:
         return None
 def build_ast_graph(ast_tree):
+    if not ast_tree: return None
     G = nx.DiGraph()
     node_id = 0
+    def traverse(node, parent=None):
         nonlocal node_id
+        current = node_id
+        G.add_node(current, type=type(node).__name__)
+        if parent is not None:
+            G.add_edge(parent, current)
         node_id += 1
+        for child in getattr(node, 'children', []):
             if isinstance(child, javalang.ast.Node):
+                traverse(child, current)
             elif isinstance(child, (list, tuple)):
                 for item in child:
                     if isinstance(item, javalang.ast.Node):
+                        traverse(item, current)
     traverse(ast_tree)
+    return G
 def ast_to_pyg_data(ast_graph):
+    if not ast_graph: return None
+    node_types = list(nx.get_node_attributes(ast_graph, 'type').values())
+    unique_types = list(set(node_types))
+    type_to_idx = {t: i for i, t in enumerate(unique_types)}
+    x = torch.zeros(len(node_types), len(unique_types))
+    for i, t in enumerate(node_types):
+        x[i, type_to_idx[t]] = 1
+    edge_index = torch.tensor(list(ast_graph.edges())).t().contiguous()
+    return Data(x=x.to(DEVICE), edge_index=edge_index.to(DEVICE))
+# Feature Extraction
 def normalize_code(code):
+    code = re.sub(r'//.*?$', '', code, flags=re.MULTILINE)
+    code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
+    return re.sub(r'\s+', ' ', code).strip()
+def get_embedding(code, tokenizer, model):
     try:
         inputs = tokenizer(
+            normalize_code(code),
             return_tensors="pt",
             truncation=True,
             max_length=MAX_LENGTH,
         ).to(DEVICE)
         with torch.no_grad():
+            return model(**inputs).last_hidden_state.mean(dim=1)
+    except:
         return None
+# Similarity Calculations
def calculate_similarities(code1, code2, models, weights=None):
    """Score two Java snippets with each model and blend the results.

    Args:
        code1: First Java source string.
        code2: Second Java source string.
        models: ``(tokenizer, code_model, rnn_model, gnn_model)``.
        weights: Optional mapping with keys ``'codebert'``, ``'rnn'`` and
            ``'gnn'``. Defaults to 0.4 / 0.3 / 0.3.

    Returns:
        Dict with keys ``'codebert'``, ``'rnn'``, ``'gnn'`` and
        ``'combined'``. A component that cannot be computed (embedding or
        parse failure) contributes 0. ``'combined'`` is clamped to [0, 1].
    """
    tokenizer, code_model, rnn_model, gnn_model = models
    if weights is None:
        weights = {'codebert': 0.4, 'rnn': 0.3, 'gnn': 0.3}

    # Get embeddings
    emb1 = get_embedding(code1, tokenizer, code_model)
    emb2 = get_embedding(code2, tokenizer, code_model)
    # Parse ASTs
    ast1 = build_ast_graph(parse_ast(code1))
    ast2 = build_ast_graph(parse_ast(code2))

    codebert_sim = 0
    rnn_sim = 0
    if emb1 is not None and emb2 is not None:
        codebert_sim = F.cosine_similarity(emb1, emb2).item()
        with torch.no_grad():
            # Treat the two embeddings as a length-2 sequence, batch of one.
            rnn_input = torch.stack([emb1.squeeze(), emb2.squeeze()])
            rnn_sim = torch.sigmoid(rnn_model(rnn_input.unsqueeze(0))).item()

    gnn_sim = 0
    if ast1 is not None and ast2 is not None:
        data1 = ast_to_pyg_data(ast1)
        data2 = ast_to_pyg_data(ast2)
        # Compare against None explicitly rather than relying on how the
        # graph-data object defines truthiness.
        if data1 is not None and data2 is not None:
            with torch.no_grad():
                score1 = gnn_model(data1).item()
                score2 = gnn_model(data2).item()
            # The GNN yields one scalar per graph. Cosine similarity is not
            # usable here: on one-element vectors the default dim=1 is out of
            # range, and two positive scalars always have cosine 1 anyway.
            # Closeness of the two scores is used instead.
            gnn_sim = 1.0 - abs(score1 - score2)

    combined = (weights['codebert'] * codebert_sim
                + weights['rnn'] * rnn_sim
                + weights['gnn'] * gnn_sim)
    # Cosine similarity can be negative or overshoot 1 by rounding error;
    # keep the blended score inside the range the UI expects.
    combined = min(1.0, max(0.0, combined))
    return {
        'codebert': codebert_sim,
        'rnn': rnn_sim,
        'gnn': gnn_sim,
        'combined': combined
    }
+# UI Components
def _classify_clone(score, t1, t3, t4):
    """Map a combined score to a clone label, checking the strictest threshold first."""
    if score >= t1:
        return "Type 1/2 Clone"
    if score >= t3:
        return "Type 3 Clone"
    if score >= t4:
        return "Type 4 Clone"
    return "No Clone"


def main():
    """Render the Streamlit page: inputs, thresholds, and analysis results."""
    st.title("🔍 Advanced Java Code Clone Detector")
    st.markdown("Detect all clone types (1-4) using hybrid analysis")
    # Load resources
    models = load_models()
    dataset_pairs = load_dataset()
    # Code input
    selected_pair = None
    if dataset_pairs:
        pair_options = {f"{i+1}: {pair['type']}": pair for i, pair in enumerate(dataset_pairs)}
        selected_option = st.selectbox("Select example pair:", list(pair_options.keys()))
        selected_pair = pair_options[selected_option]
    col1, col2 = st.columns(2)
    with col1:
        code1 = st.text_area("Code 1", height=300, value=selected_pair["code1"] if selected_pair else "")
    with col2:
        code2 = st.text_area("Code 2", height=300, value=selected_pair["code2"] if selected_pair else "")
    # Thresholds
    st.subheader("Detection Thresholds")
    cols = st.columns(3)
    with cols[0]:
        t1 = st.slider("Type 1/2", 0.85, 1.0, 0.95)
    with cols[1]:
        t3 = st.slider("Type 3", 0.7, 0.9, 0.8)
    with cols[2]:
        t4 = st.slider("Type 4", 0.5, 0.8, 0.65)
    # Analysis
    if not st.button("Analyze", type="primary"):
        return
    if models[0] is None:
        # Without this the click was silently ignored when loading had failed.
        st.error("Models are unavailable, so the code cannot be analyzed.")
        return
    if not code1.strip() or not code2.strip():
        st.warning("Enter Java code in both boxes before analyzing.")
        return
    with st.spinner("Analyzing..."):
        sims = calculate_similarities(code1, code2, models)
    clone_type = _classify_clone(sims['combined'], t1, t3, t4)
    # Display results
    st.subheader("Results")
    cols = st.columns(4)
    cols[0].metric("Combined", f"{sims['combined']:.2f}")
    cols[1].metric("CodeBERT", f"{sims['codebert']:.2f}")
    cols[2].metric("RNN", f"{sims['rnn']:.2f}")
    cols[3].metric("GNN", f"{sims['gnn']:.2f}")
    # The progress bar rejects floats outside [0, 1]; clamp defensively.
    st.progress(min(1.0, max(0.0, float(sims['combined']))))
    st.metric("Detection Result", clone_type)
    # Show details
    with st.expander("Details"):
        st.json(sims)
        st.code(f"Normalized Code 1:\n{normalize_code(code1)}")
        st.code(f"Normalized Code 2:\n{normalize_code(code2)}")


if __name__ == "__main__":
    main()