raghuram00 commited on
Commit
24be017
·
0 Parent(s):

Initial commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ .env
4
+ venv/
5
+ .DS_Store
6
+
7
+ # Large Model Files
8
+ best_model.pt
9
+ label_encoder.pkl
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Slim Python base keeps the final image small; 3.9 matches the training env.
FROM python:3.9-slim

# Set work directory
WORKDIR /app

# Install dependencies first so Docker layer caching skips the (slow) pip
# install when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the entire project
COPY . .

# Download model weights at build time (bypasses GitHub LFS issues)
RUN python download_model.py

# Expose port for Render
EXPOSE 8000

# Run FastAPI via Uvicorn
CMD ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "8000"]
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Code Complexity Predictor
3
+ emoji: 📉
4
+ colorFrom: blue
5
+ colorTo: yellow
6
+ sdk: docker
7
+ app_port: 8000
9
+ pinned: false
10
+ license: apache-2.0
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Untitled2.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
backend/main.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import torch
import joblib
from fastapi import FastAPI, HTTPException
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Complexity descriptions: label -> (Big-O notation, display title, explanation).
# Keys must match the class names produced by the LabelEncoder saved at
# training time (see improved_training.py).
DESCRIPTIONS = {
    "constant": ("O(1)", "⚡ Constant Time", "Executes in the same time regardless of input size. Very fast!"),
    "linear": ("O(n)", "📈 Linear Time", "Execution time grows linearly with input size."),
    "logn": ("O(log n)", "🔍 Logarithmic Time", "Very efficient! Common in binary search algorithms."),
    "nlogn": ("O(n log n)", "⚙️ Linearithmic Time", "Common in efficient sorting algorithms like merge sort."),
    "quadratic": ("O(n²)", "🐢 Quadratic Time", "Execution time grows quadratically. Common in nested loops."),
    "cubic": ("O(n³)", "🦕 Cubic Time", "Triple nested loops. Avoid for large inputs."),
    "np": ("O(2ⁿ)", "💀 Exponential Time", "NP-Hard complexity. Only feasible for very small inputs."),
}

app = FastAPI(title="Code Complexity Predictor API")

class PredictRequest(BaseModel):
    # Raw source code snippet to classify.
    code: str

# Global state, populated once by the startup hook below (None until then).
model = None      # fine-tuned sequence-classification model
tokenizer = None  # CodeBERT tokenizer
le = None         # sklearn LabelEncoder: class index -> complexity label string
device = None     # torch device (cuda if available, else cpu)
30
+
31
@app.on_event("startup")
def load_resources():
    """Load tokenizer, label encoder and fine-tuned model once at startup.

    Populates the module-level globals used by the /api/predict endpoint.
    Missing artifact files are tolerated with a warning so the app can still
    boot (the endpoint will then fail at inference time).
    NOTE(review): @app.on_event is deprecated in recent FastAPI versions in
    favour of lifespan handlers — still functional, but worth migrating.
    """
    global model, tokenizer, le, device
    print("Loading resources...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load tokenizer (base CodeBERT vocabulary; no fine-tuned tokenizer needed)
    tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

    # Load label encoder (maps predicted class indices back to label strings)
    if os.path.exists("label_encoder.pkl"):
        le = joblib.load("label_encoder.pkl")
    else:
        print("WARNING: label_encoder.pkl not found!")

    # Load model: base architecture with a 7-way classification head, then
    # overwrite the weights with the fine-tuned checkpoint if present.
    if os.path.exists("best_model.pt"):
        model = AutoModelForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=7)
        model.load_state_dict(torch.load("best_model.pt", map_location=device))
    else:
        model = AutoModelForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=7)
        print("WARNING: best_model.pt not found!")

    model.to(device)
    model.eval()
    print("Resources loaded successfully!")
56
+
57
@app.post("/api/predict")
def predict_complexity(request: PredictRequest):
    """Classify the time complexity of a code snippet.

    Returns a dict with the Big-O `notation`, a human-readable `title` and a
    short `description`.

    Raises:
        HTTPException 400: the submitted code is empty/whitespace.
        HTTPException 503: model artifacts were not loaded at startup.
        HTTPException 500: unexpected inference failure.
    """
    code = request.code
    if not code.strip():
        raise HTTPException(status_code=400, detail="Code cannot be empty")

    # Fail fast with a clear error instead of an opaque AttributeError->500
    # when best_model.pt / label_encoder.pkl were missing at startup.
    if model is None or tokenizer is None or le is None:
        raise HTTPException(status_code=503, detail="Model resources are not loaded")

    try:
        inputs = tokenizer(code, truncation=True, max_length=512, padding='max_length', return_tensors='pt')
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        # Inference only — no gradients needed.
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            pred = torch.argmax(outputs.logits, dim=1).item()

        # Map class index -> label string -> display tuple (fall back to the
        # bare label if it has no entry in DESCRIPTIONS).
        label = le.inverse_transform([pred])[0]
        notation, title, description = DESCRIPTIONS.get(label, (label, label, ""))

        return {
            "notation": notation,
            "title": title,
            "description": description
        }
    except HTTPException:
        # Don't convert deliberate HTTP errors into generic 500s.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
82
+
83
# Mount frontend: serve the static files in ./frontend at the root path.
# Registered after the API routes so /api/* keeps precedence over "/".
app.mount("/", StaticFiles(directory="frontend", html=True), name="frontend")
download_model.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import urllib.request
2
+ import os
3
+
4
def download_file(url, target_path):
    """Download *url* to *target_path*, streaming the body to disk.

    Streams in chunks via shutil.copyfileobj instead of a single
    response.read(): the model checkpoint is large, so buffering the whole
    response in memory could exhaust RAM on small build containers.
    """
    import shutil  # local import keeps the module's top-level imports unchanged

    print(f"Downloading {target_path} from {url}...")
    # Some hosts reject urllib's default User-Agent, so send a browser-like one.
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req) as response, open(target_path, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
    print(f"✅ Successfully downloaded {target_path} (Size: {os.path.getsize(target_path)/(1024*1024):.2f} MB)")
10
+
11
# Artifact URLs on the Hugging Face Space (resolved through LFS).
url_model = "https://huggingface.co/spaces/raghuram00/code-complexity-predictor/resolve/main/best_model.pt"
url_le = "https://huggingface.co/spaces/raghuram00/code-complexity-predictor/resolve/main/label_encoder.pkl"

# Fetch each artifact only when it is not already present on disk,
# so repeated builds / restarts skip the download.
for artifact_url, artifact_path in ((url_model, "best_model.pt"), (url_le, "label_encoder.pkl")):
    if not os.path.exists(artifact_path):
        download_file(artifact_url, artifact_path)
frontend/index.html ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Code Complexity Predictor</title>
    <!-- Modern Fonts -->
    <link rel="preconnect" href="https://fonts.googleapis.com">
    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600&family=JetBrains+Mono:wght@400;700&family=Outfit:wght@600;800&display=swap" rel="stylesheet">

    <!-- Prism.js for code syntax highlighting -->
    <link href="https://cdnjs.cloudflare.com/ajax/libs/prism/1.29.0/themes/prism-tomorrow.min.css" rel="stylesheet" />

    <!-- Custom Styles -->
    <link rel="stylesheet" href="styles.css">
</head>
<body>
    <!-- Decorative blurred orbs animated behind the app (see styles.css) -->
    <div class="background-effects">
        <div class="glow-orb orb-1"></div>
        <div class="glow-orb orb-2"></div>
    </div>

    <main class="container">
        <header>
            <div class="logo-wrapper">
                <svg width="40" height="40" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg">
                    <path d="M10 20L14 4M18 8L22 12L18 16M6 16L2 12L6 8" stroke="url(#paint0_linear)" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
                    <defs>
                        <linearGradient id="paint0_linear" x1="2" y1="12" x2="22" y2="12" gradientUnits="userSpaceOnUse">
                            <stop stop-color="#00F0FF" />
                            <stop offset="1" stop-color="#8000FF" />
                        </linearGradient>
                    </defs>
                </svg>
                <h1>Complexity <span>Predictor</span></h1>
            </div>
            <p class="subtitle">// AI-powered algorithmic Big-O analysis using CodeBERT</p>
        </header>

        <div class="app-grid">
            <!-- Left Side: Code Input -->
            <section class="editor-section">
                <div class="card glass-panel">
                    <div class="card-header">
                        <span class="dot red"></span>
                        <span class="dot yellow"></span>
                        <span class="dot green"></span>
                        <span class="filename">algorithm.py</span>
                    </div>
                    <div class="editor-wrapper">
                        <!-- We use a textarea over a pre code block to allow editing,
                             but styling them to overlap for syntax highlighting -->
                        <textarea id="codeInput" placeholder="# Paste your Python or Java code here...
def example(arr):
for item in arr:
print(item)"></textarea>
                    </div>
                </div>
                <button id="analyzeBtn" class="primary-btn">
                    <span class="btn-text">⚡ Analyze Complexity</span>
                    <span class="loader hidden"></span>
                </button>

                <div class="examples-section">
                    <h3>Try these examples:</h3>
                    <!-- data-example carries escaped \n sequences; script.js
                         expands them into real newlines before analysis -->
                    <div class="example-chips">
                        <button class="chip" data-example='def get_first(arr):\n return arr[0]'>O(1) Constant</button>
                        <button class="chip" data-example='def linear_search(arr, target):\n for i in range(len(arr)):\n if arr[i] == target:\n return i\n return -1'>O(n) Linear</button>
                        <button class="chip" data-example='def bubble_sort(arr):\n n = len(arr)\n for i in range(n):\n for j in range(0, n-i-1):\n if arr[j] > arr[j+1]:\n arr[j], arr[j+1] = arr[j+1], arr[j]'>O(n²) Quadratic</button>
                    </div>
                </div>
            </section>

            <!-- Right Side: Results (populated by script.js from /api/predict) -->
            <section class="results-section">
                <div class="result-card glass-panel">
                    <h2 class="result-label">Big-O Notation</h2>
                    <div class="result-value glow-text" id="resNotation">O(?)</div>
                </div>

                <div class="result-card glass-panel">
                    <h2 class="result-label">Complexity Class</h2>
                    <div class="result-value" id="resTitle">-</div>
                </div>

                <div class="result-card glass-panel desc-card">
                    <h2 class="result-label">Explanation</h2>
                    <p class="result-desc" id="resDesc">Awaiting code input to predict complexity.</p>
                </div>
            </section>
        </div>
    </main>

    <script src="https://cdnjs.cloudflare.com/ajax/libs/prism/1.29.0/prism.min.js"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/prism/1.29.0/components/prism-python.min.js"></script>
    <script src="script.js"></script>
</body>
</html>
frontend/script.js ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
document.addEventListener('DOMContentLoaded', () => {
    const codeInput = document.getElementById('codeInput');
    const analyzeBtn = document.getElementById('analyzeBtn');
    const btnText = analyzeBtn.querySelector('.btn-text');
    const loader = analyzeBtn.querySelector('.loader');

    // Result elements
    const resNotation = document.getElementById('resNotation');
    const resTitle = document.getElementById('resTitle');
    const resDesc = document.getElementById('resDesc');

    // Example chips: clicking one fills the editor and analyzes immediately
    document.querySelectorAll('.chip').forEach(chip => {
        chip.addEventListener('click', () => {
            // Replace \n from dataset with actual newlines
            codeInput.value = chip.dataset.example.replace(/\\n/g, '\n');
            triggerAnalysis();
        });
    });

    analyzeBtn.addEventListener('click', triggerAnalysis);

    // Also support Cmd/Ctrl + Enter to trigger
    codeInput.addEventListener('keydown', (e) => {
        if ((e.ctrlKey || e.metaKey) && e.key === 'Enter') {
            triggerAnalysis();
        }
    });

    /**
     * POST the editor contents to /api/predict and render the response.
     * Manages the button's loading state and fades the result cards while
     * the request is in flight.
     */
    async function triggerAnalysis() {
        const code = codeInput.value.trim();

        if (!code) {
            resNotation.innerHTML = "O(?)";
            resTitle.innerHTML = "Error";
            resDesc.innerHTML = "⚠️ Please paste some code before analyzing!";
            return;
        }

        // UI Loading state
        analyzeBtn.disabled = true;
        btnText.innerHTML = "Analyzing structure...";
        loader.classList.remove('hidden');
        resNotation.style.opacity = '0.5';
        resTitle.style.opacity = '0.5';
        resDesc.style.opacity = '0.5';

        try {
            const response = await fetch('/api/predict', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json'
                },
                body: JSON.stringify({ code: code })
            });

            if (!response.ok) {
                throw new Error(`Server error: ${response.status}`);
            }

            const data = await response.json();

            // Render Results
            resNotation.innerHTML = data.notation;
            resTitle.innerHTML = data.title;
            resDesc.innerHTML = data.description;

        } catch (error) {
            console.error('Analysis failed:', error);
            resNotation.innerHTML = "O(?)";
            resTitle.innerHTML = "Analysis Failed";
            resDesc.innerHTML = "An error occurred while connecting to the AI model. Ensure the backend is running.";
        } finally {
            // Restore UI state
            analyzeBtn.disabled = false;
            btnText.innerHTML = "⚡ Analyze Complexity";
            loader.classList.add('hidden');

            // Fade results back in
            resNotation.style.opacity = '1';
            resTitle.style.opacity = '1';
            resDesc.style.opacity = '1';

            // Add a little pop animation to the results
            document.querySelectorAll('.result-card').forEach(card => {
                card.style.transform = 'scale(1.02)';
                setTimeout(() => card.style.transform = 'scale(1)', 200);
            });
        }
    }
});
frontend/styles.css ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Design tokens for the dark "glassmorphism" theme used across the app. */
:root {
    --bg-color: #050508;
    --panel-bg: rgba(22, 22, 30, 0.6);
    --border-color: rgba(255, 255, 255, 0.08);
    --text-main: #f0f0f5;
    --text-dim: #8b8b9f;
    --accent-primary: #00F0FF;
    --accent-secondary: #8000FF;
    --glass-blur: 16px;
}

* {
    margin: 0;
    padding: 0;
    box-sizing: border-box;
}

body {
    background-color: var(--bg-color);
    color: var(--text-main);
    font-family: 'Inter', sans-serif;
    min-height: 100vh;
    display: flex;
    justify-content: center;
    position: relative;
    overflow-x: hidden;
}

/* Animated Background Orbs */
.background-effects {
    position: fixed;
    top: 0;
    left: 0;
    width: 100vw;
    height: 100vh;
    z-index: -1;
    overflow: hidden;
    pointer-events: none;
}

.glow-orb {
    position: absolute;
    border-radius: 50%;
    filter: blur(120px);
    opacity: 0.4;
    animation: float 20s infinite alternate ease-in-out;
}

.orb-1 {
    width: 600px;
    height: 600px;
    background: var(--accent-secondary);
    top: -200px;
    right: -100px;
}

.orb-2 {
    width: 500px;
    height: 500px;
    background: var(--accent-primary);
    bottom: -200px;
    left: -100px;
    animation-delay: -10s; /* offset so the two orbs drift out of phase */
}

@keyframes float {
    0% { transform: translate(0, 0) scale(1); }
    100% { transform: translate(-100px, 100px) scale(1.2); }
}

.container {
    width: 100%;
    max-width: 1200px;
    padding: 3rem 2rem;
    display: flex;
    flex-direction: column;
    gap: 3rem;
}

/* Header */
header {
    text-align: center;
    display: flex;
    flex-direction: column;
    align-items: center;
    gap: 1rem;
    animation: fadeDown 0.8s ease-out;
}

.logo-wrapper {
    display: flex;
    align-items: center;
    gap: 1rem;
}

h1 {
    font-family: 'Outfit', sans-serif;
    font-size: 3.5rem;
    font-weight: 800;
    letter-spacing: -1px;
}

/* Gradient text effect on the highlighted word of the title */
h1 span {
    background: linear-gradient(135deg, var(--accent-primary), var(--accent-secondary));
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
}

.subtitle {
    font-family: 'JetBrains Mono', monospace;
    color: var(--text-dim);
    font-size: 0.9rem;
}

/* Main Grid */
.app-grid {
    display: grid;
    grid-template-columns: 1.5fr 1fr;
    gap: 2rem;
    align-items: start;
    animation: fadeUp 1s ease-out;
}

/* Glassmorphism Panels */
.glass-panel {
    background: var(--panel-bg);
    backdrop-filter: blur(var(--glass-blur));
    -webkit-backdrop-filter: blur(var(--glass-blur));
    border: 1px solid var(--border-color);
    border-radius: 20px;
    box-shadow: 0 8px 32px 0 rgba(0, 0, 0, 0.3);
}

/* Editor Section */
.card-header {
    padding: 1rem;
    border-bottom: 1px solid var(--border-color);
    display: flex;
    align-items: center;
    gap: 0.5rem;
    background: rgba(0,0,0,0.2);
    border-radius: 20px 20px 0 0;
}

/* macOS-style traffic-light window dots */
.dot {
    width: 12px;
    height: 12px;
    border-radius: 50%;
}
.red { background: #ff5f56; }
.yellow { background: #ffbd2e; }
.green { background: #27c93f; }

.filename {
    margin-left: 1rem;
    font-family: 'JetBrains Mono', monospace;
    font-size: 0.8rem;
    color: var(--text-dim);
}

.editor-wrapper {
    position: relative;
    padding: 1rem;
}

textarea {
    width: 100%;
    min-height: 350px;
    background: transparent;
    border: none;
    color: var(--text-main);
    font-family: 'JetBrains Mono', monospace;
    font-size: 0.9rem;
    line-height: 1.6;
    resize: vertical;
    outline: none;
}

textarea::placeholder {
    color: rgba(255, 255, 255, 0.2);
}

/* Button */
.primary-btn {
    width: 100%;
    margin-top: 1.5rem;
    padding: 1.2rem;
    border: none;
    border-radius: 12px;
    background: linear-gradient(135deg, var(--accent-primary), var(--accent-secondary));
    color: #fff;
    font-family: 'Outfit', sans-serif;
    font-size: 1.2rem;
    font-weight: 600;
    cursor: pointer;
    transition: all 0.3s ease;
    display: flex;
    justify-content: center;
    align-items: center;
    gap: 1rem;
    position: relative;
    overflow: hidden;
}

/* Sheen overlay that sweeps across the button on hover */
.primary-btn::before {
    content: '';
    position: absolute;
    top: 0; left: -100%;
    width: 100%; height: 100%;
    background: linear-gradient(90deg, transparent, rgba(255,255,255,0.2), transparent);
    transition: 0.5s;
}

.primary-btn:hover::before {
    left: 100%;
}

.primary-btn:hover {
    transform: translateY(-2px);
    box-shadow: 0 10px 20px rgba(128, 0, 255, 0.3);
}

.primary-btn:active {
    transform: translateY(0);
}

/* Loader */
.loader {
    width: 20px;
    height: 20px;
    border: 3px solid rgba(255,255,255,0.3);
    border-radius: 50%;
    border-top-color: #fff;
    animation: spin 1s ease-in-out infinite;
}
.hidden { display: none; }

@keyframes spin {
    to { transform: rotate(360deg); }
}

/* Examples */
.examples-section {
    margin-top: 2rem;
}
.examples-section h3 {
    font-size: 0.9rem;
    color: var(--text-dim);
    margin-bottom: 1rem;
    font-weight: 500;
}
.example-chips {
    display: flex;
    gap: 0.8rem;
    flex-wrap: wrap;
}
.chip {
    background: rgba(255,255,255,0.05);
    border: 1px solid rgba(255,255,255,0.1);
    color: var(--text-main);
    padding: 0.5rem 1rem;
    border-radius: 20px;
    font-family: 'JetBrains Mono', monospace;
    font-size: 0.8rem;
    cursor: pointer;
    transition: all 0.2s;
}
.chip:hover {
    background: rgba(255,255,255,0.1);
    border-color: var(--accent-primary);
}

/* Results Section */
.results-section {
    display: flex;
    flex-direction: column;
    gap: 1.5rem;
}

.result-card {
    padding: 2rem;
    transition: transform 0.3s; /* enables the "pop" animation from script.js */
}

.result-card:hover {
    transform: translateY(-2px);
}

.result-label {
    font-size: 0.8rem;
    text-transform: uppercase;
    letter-spacing: 2px;
    color: var(--text-dim);
    margin-bottom: 1rem;
}

.result-value {
    font-family: 'Outfit', sans-serif;
    font-size: 2rem;
    font-weight: 800;
}

.glow-text {
    background: linear-gradient(135deg, var(--accent-primary), #fff);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 3rem;
}

.result-desc {
    color: #a0a0b5;
    line-height: 1.6;
    font-size: 1.1rem;
}

/* Animations */
@keyframes fadeDown {
    from { opacity: 0; transform: translateY(-20px); }
    to { opacity: 1; transform: translateY(0); }
}

@keyframes fadeUp {
    from { opacity: 0; transform: translateY(20px); }
    to { opacity: 1; transform: translateY(0); }
}

/* Responsive: stack editor above results on narrow screens */
@media (max-width: 900px) {
    .app-grid {
        grid-template-columns: 1fr;
    }
    h1 { font-size: 2.5rem; }
}
improved_training.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# ==============================================================================
# 🚀 IMPROVED CODE COMPLEXITY PREDICTOR TRAINING SCRIPT 🚀
# ==============================================================================
# Run this entire script in Google Colab (either pasted into a cell or via script)

# 1. Install dependencies
# !pip install -q transformers datasets torch scikit-learn

import pandas as pd
import torch
import torch.nn as nn
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
import os
import shutil

# ------------------------------------------------------------------------------
# ⚙️ CONFIGURATION & HYPERPARAMETERS
# ------------------------------------------------------------------------------
MODEL_NAME = "microsoft/graphcodebert-base"  # 🔥 Upgraded to GraphCodeBERT
MAX_LEN = 512         # Max token length per code snippet
BATCH_SIZE = 16       # Training batch size
EPOCHS = 15           # 🔥 Increased from 3 to 15 (early stopping limits overfit)
LEARNING_RATE = 3e-5  # Optimized initial learning rate
WEIGHT_DECAY = 0.05   # 🔥 Increased regularization
PATIENCE = 3          # 🔥 Early-stopping patience (epochs without improvement)
SAVE_PATH = "best_model.pt"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🖥️ Using device: {device}")
37
+
38
# ------------------------------------------------------------------------------
# 📊 DATA PREPARATION
# ------------------------------------------------------------------------------
print("\n[1/5] Loading Dataset...")
dataset = load_dataset("codeparrot/codecomplex")
df = pd.DataFrame(dataset['train'])

# Encode complexity strings (e.g. "linear") into integer class labels.
le = LabelEncoder()
df['label'] = le.fit_transform(df['complexity'])

# Save Label Encoder for Inference (the serving API needs it to decode predictions)
import joblib
joblib.dump(le, "label_encoder.pkl")

# Train/Test Split (stratified so every complexity class keeps its proportion)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# Calculate Class Weights to handle imbalance right from the start:
# weight_i = total / count_i, so rarer classes get a larger loss weight.
class_counts = train_df['label'].value_counts().sort_index().values
total_samples = sum(class_counts)
class_weights = torch.tensor([total_samples / c for c in class_counts], dtype=torch.float).to(device)

print(f"✅ Loaded {len(train_df)} training and {len(test_df)} testing samples.")

# ------------------------------------------------------------------------------
# 🧠 TOKENIZATION & DATASETS
# ------------------------------------------------------------------------------
print(f"\n[2/5] Initializing Tokenizer ({MODEL_NAME})...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
68
+
69
class CodeDataset(Dataset):
    """Torch dataset yielding tokenized code snippets with integer labels.

    Expects a DataFrame with 'src' (code text) and 'label' (encoded class)
    columns; each item is a dict ready for the sequence-classification model.
    """

    def __init__(self, dataframe, tokenizer, max_length=MAX_LEN):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        tokens = self.tokenizer(
            str(row['src']),
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )

        # squeeze() drops the batch dimension added by return_tensors='pt';
        # the DataLoader re-batches the items itself.
        return {
            'input_ids': tokens['input_ids'].squeeze(),
            'attention_mask': tokens['attention_mask'].squeeze(),
            'label': torch.tensor(int(row['label']), dtype=torch.long)
        }
95
+
96
# Build datasets/loaders; reset_index keeps .iloc positions contiguous after the split.
train_dataset = CodeDataset(train_df.reset_index(drop=True), tokenizer)
test_dataset = CodeDataset(test_df.reset_index(drop=True), tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# ------------------------------------------------------------------------------
# 🏗️ MODEL INITIALIZATION
# ------------------------------------------------------------------------------
print(f"\n[3/5] Loading Model ({MODEL_NAME})...")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=7)
model = model.to(device)

# Optimizer with Weight Decay
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Scheduler: linear decay after a warmup ramp over the first 10% of steps.
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(total_steps * 0.1),  # 10% warmup
    num_training_steps=total_steps
)

# Loss function with balanced classes (weights computed in the data-prep step)
criterion = nn.CrossEntropyLoss(weight=class_weights)
122
+
123
+ # ------------------------------------------------------------------------------
124
+ # πŸƒ TRAINING & EVALUATION FUNCTIONS
125
+ # ------------------------------------------------------------------------------
126
def train_epoch(model, loader, optimizer, scheduler, criterion, device):
    """Run one optimization pass over `loader`.

    Returns (mean batch loss, training accuracy) for the epoch.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    for batch in tqdm(loader, desc="Training", leave=False):
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=ids, attention_mask=mask)
        loss = criterion(outputs.logits, targets)

        loss.backward()
        # Clip gradients to stabilise transformer fine-tuning.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
        n_correct += (torch.argmax(outputs.logits, dim=1) == targets).sum().item()
        n_seen += targets.size(0)

    return running_loss / len(loader), n_correct / n_seen
150
+
151
def evaluate(model, loader, device):
    """Return classification accuracy of `model` over `loader` (no gradients)."""
    model.eval()
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating", leave=False):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            logits = model(input_ids=ids, attention_mask=mask).logits
            n_correct += (torch.argmax(logits, dim=1) == targets).sum().item()
            n_seen += targets.size(0)

    return n_correct / n_seen
167
+
168
+ # ------------------------------------------------------------------------------
169
+ # πŸ₯‡ MAIN TRAINING LOOP WITH EARLY STOPPING
170
+ # ------------------------------------------------------------------------------
171
+ print("\n[4/5] Starting Training Loop...")
172
+ best_accuracy = 0
173
+ epochs_no_improve = 0
174
+
175
+ for epoch in range(EPOCHS):
176
+ print(f"\nπŸ”„ Epoch {epoch+1}/{EPOCHS}")
177
+
178
+ train_loss, train_acc = train_epoch(model, train_loader, optimizer, scheduler, criterion, device)
179
+ test_acc = evaluate(model, test_loader, device)
180
+
181
+ print(f"πŸ“ˆ Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}% | Test Acc: {test_acc*100:.2f}%")
182
+
183
+ # Early Stopping Logic
184
+ if test_acc > best_accuracy:
185
+ best_accuracy = test_acc
186
+ epochs_no_improve = 0
187
+ torch.save(model.state_dict(), SAVE_PATH)
188
+ print(f"⭐ NEW BEST MODEL SAVED! Accuracy: {best_accuracy*100:.2f}%")
189
+ else:
190
+ epochs_no_improve += 1
191
+ print(f"⚠️ No improvement for {epochs_no_improve} epochs.")
192
+
193
+ if epochs_no_improve >= PATIENCE:
194
+ print(f"\n⏹️ EARLY STOPPING TRIGGERED! Test accuracy hasn't improved in {PATIENCE} epochs.")
195
+ break
196
+
197
+ # ------------------------------------------------------------------------------
198
+ # πŸ’Ύ EXPORTING TO DRIVE
199
+ # ------------------------------------------------------------------------------
200
+ print("\n[5/5] Finalizing...")
201
+ try:
202
+ from google.colab import drive
203
+ drive.mount('/content/drive', force_remount=True)
204
+ shutil.copy(SAVE_PATH, f"/content/drive/MyDrive/{SAVE_PATH}")
205
+ shutil.copy("label_encoder.pkl", "/content/drive/MyDrive/label_encoder.pkl")
206
+ print("βœ… Files successfully backed up to Google Drive!")
207
+ except ImportError:
208
+ print("Not running in Colab - skipping Drive export.")
render.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
services:
  - type: web
    name: code-complexity-predictor
    env: docker
    # FIX: `instanceCT` is not a key in the Render blueprint schema and would
    # be rejected/ignored; the field controlling instance count is
    # `numInstances`.
    numInstances: 1
    plan: free
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ transformers
4
+ torch
5
+ scikit-learn
6
+ pandas
7
+ joblib
8
+ python-multipart
train_extracted.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Notebook-extracted script: the `!pip` line below is IPython shell magic and
# only runs inside a notebook/Colab environment, not under plain Python.
!pip install transformers datasets torch scikit-learn

# --- CELL ---

from datasets import load_dataset

# CodeComplex: code snippets labeled with their asymptotic time complexity.
dataset = load_dataset("codeparrot/codecomplex")
print(dataset)
print(dataset['train'][0])

# --- CELL ---

import pandas as pd

df = pd.DataFrame(dataset['train'])

# Check complexity labels (class distribution of the target)
print("Complexity classes:")
print(df['complexity'].value_counts())

print("\nLanguages:")
print(df['from'].value_counts())

print("\nTotal samples:", len(df))

# --- CELL ---

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Encode labels: map each complexity-class string to an integer id.
le = LabelEncoder()
df['label'] = le.fit_transform(df['complexity'])

print("Label mapping:")
for i, cls in enumerate(le.classes_):
    print(f" {cls} → {i}")

# Split data — stratified so train/test keep the same class proportions.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

print(f"\nTrain size: {len(train_df)}")
print(f"Test size: {len(test_df)}")

# --- CELL ---

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

print("✅ CodeBERT tokenizer loaded!")

# Test it on a short prefix of the first sample to confirm the tokenizer works.
sample = df['src'][0][:200]
tokens = tokenizer(sample, truncation=True, max_length=512, return_tensors="pt")
print("Sample token shape:", tokens['input_ids'].shape)
57
+
58
+ # --- CELL ---
59
+
60
import torch
from torch.utils.data import Dataset

class CodeDataset(Dataset):
    """Expose a DataFrame of (src, label) rows as a PyTorch Dataset.

    Each item is tokenized lazily on access and returned as fixed-length
    tensors suitable for a sequence-classification model.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        encoded = self.tokenizer(
            str(row['src']),
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )

        # squeeze() drops the batch dimension the tokenizer adds for a single
        # example; the DataLoader re-batches downstream.
        return {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze(),
            'label': torch.tensor(int(row['label']), dtype=torch.long)
        }
89
+
90
+ # Create datasets
91
+ train_dataset = CodeDataset(train_df.reset_index(drop=True), tokenizer)
92
+ test_dataset = CodeDataset(test_df.reset_index(drop=True), tokenizer)
93
+
94
+ print(f"βœ… Train dataset: {len(train_dataset)} samples")
95
+ print(f"βœ… Test dataset: {len(test_dataset)} samples")
96
+
97
+ # --- CELL ---
98
+
99
+ from transformers import AutoModelForSequenceClassification
100
+ import torch
101
+
102
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
103
+ print(f"Using device: {device}")
104
+
105
+ model = AutoModelForSequenceClassification.from_pretrained(
106
+ "microsoft/codebert-base",
107
+ num_labels=7
108
+ )
109
+
110
+ model = model.to(device)
111
+ print("βœ… CodeBERT model loaded!")
112
+ print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")
113
+
114
+ # --- CELL ---
115
+
116
+ from torch.utils.data import DataLoader
117
+ from torch.optim import AdamW
118
+ from transformers import get_linear_schedule_with_warmup
119
+
120
+ # DataLoaders
121
+ train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
122
+ test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
123
+
124
+ # Optimizer
125
+ optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
126
+
127
+ # Scheduler
128
+ total_steps = len(train_loader) * 3 # 3 epochs
129
+ scheduler = get_linear_schedule_with_warmup(
130
+ optimizer,
131
+ num_warmup_steps=total_steps // 10,
132
+ num_training_steps=total_steps
133
+ )
134
+
135
+ print(f"βœ… DataLoaders ready!")
136
+ print(f"Total training steps: {total_steps}")
137
+ print(f"Steps per epoch: {len(train_loader)}")
138
+
139
+ # --- CELL ---
140
+
141
+ from tqdm import tqdm
142
+
143
+ def train_epoch(model, loader, optimizer, scheduler, device):
144
+ model.train()
145
+ total_loss = 0
146
+ correct = 0
147
+ total = 0
148
+
149
+ for batch in tqdm(loader, desc="Training"):
150
+ input_ids = batch['input_ids'].to(device)
151
+ attention_mask = batch['attention_mask'].to(device)
152
+ labels = batch['label'].to(device)
153
+
154
+ optimizer.zero_grad()
155
+ outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
156
+ loss = outputs.loss
157
+ logits = outputs.logits
158
+
159
+ loss.backward()
160
+ torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
161
+ optimizer.step()
162
+ scheduler.step()
163
+
164
+ total_loss += loss.item()
165
+ preds = torch.argmax(logits, dim=1)
166
+ correct += (preds == labels).sum().item()
167
+ total += labels.size(0)
168
+
169
+ return total_loss / len(loader), correct / total
170
+
171

def evaluate(model, loader, device):
    """Return classification accuracy of `model` over `loader` (no gradients)."""
    model.eval()
    hits = 0
    seen = 0

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # No labels passed: we only need logits for the argmax prediction.
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            hits += (logits.argmax(dim=1) == labels).sum().item()
            seen += labels.size(0)

    return hits / seen
189
+
190
+
191
+ # Train for 3 epochs
192
+ best_accuracy = 0
193
+
194
+ for epoch in range(3):
195
+ print(f"\nπŸ”„ Epoch {epoch+1}/3")
196
+ train_loss, train_acc = train_epoch(model, train_loader, optimizer, scheduler, device)
197
+ test_acc = evaluate(model, test_loader, device)
198
+
199
+ print(f"Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}% | Test Acc: {test_acc*100:.2f}%")
200
+
201
+ if test_acc > best_accuracy:
202
+ best_accuracy = test_acc
203
+ torch.save(model.state_dict(), "best_model.pt")
204
+ print(f"βœ… Best model saved! Accuracy: {best_accuracy*100:.2f}%")
205
+
206
+ # --- CELL ---
207
+
208
+ # Train 2 more epochs
209
+ for epoch in range(2):
210
+ print(f"\nπŸ”„ Epoch {epoch+4}/5")
211
+ train_loss, train_acc = train_epoch(model, train_loader, optimizer, scheduler, device)
212
+ test_acc = evaluate(model, test_loader, device)
213
+
214
+ print(f"Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}% | Test Acc: {test_acc*100:.2f}%")
215
+
216
+ if test_acc > best_accuracy:
217
+ best_accuracy = test_acc
218
+ torch.save(model.state_dict(), "best_model.pt")
219
+ print(f"βœ… Best model saved! Accuracy: {best_accuracy*100:.2f}%")
220
+
221
# --- CELL ---

from google.colab import drive
drive.mount('/content/drive')

# --- CELL ---

import pickle
import shutil

# FIX: `label_encoder.pkl` was never written anywhere in this script, so the
# copy below used to raise FileNotFoundError. Serialize the fitted encoder
# first (plain pickle; joblib.load on the serving side can also read it).
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

# Copy files to Google Drive
shutil.copy("best_model.pt", "/content/drive/MyDrive/best_model.pt")
shutil.copy("label_encoder.pkl", "/content/drive/MyDrive/label_encoder.pkl")

print("✅ Files saved to Google Drive!")
235
+
236
# --- CELL ---

# Test the model directly in Colab: a few hand-written snippets to sanity-
# check predictions before exporting (not a substitute for the test set).
test_codes = [
    "public int findMax(int[] arr) { int max = arr[0]; for (int i = 1; i < arr.length; i++) { if (arr[i] > max) max = arr[i]; } return max; }",
    "return arr[0];",
    "for(int i=0;i<n;i++) for(int j=0;j<n;j++) sum+=arr[i][j];",
]

for code in test_codes:
    # Tokenize exactly as during training (same max_length and padding).
    inputs = tokenizer(code, truncation=True, max_length=512, padding='max_length', return_tensors='pt')
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        pred = torch.argmax(outputs.logits, dim=1).item()

    # Map the integer class id back to its complexity label string.
    print(f"Code: {code[:50]}...")
    print(f"Predicted: {le.inverse_transform([pred])[0]}\n")
256
+
257
# --- CELL ---

import torch.nn as nn

# Count class frequencies — inverse-frequency weights so rare complexity
# classes contribute proportionally more to the loss than common ones.
class_counts = df['label'].value_counts().sort_index().values
total = sum(class_counts)
class_weights = torch.tensor([total/c for c in class_counts], dtype=torch.float).to(device)

print("Class weights:", class_weights)
267
+
268
# New training loop with weighted loss
def train_epoch_weighted(model, loader, optimizer, scheduler, device, weights):
    """One training pass using class-weighted cross-entropy.

    Unlike `train_epoch`, the loss is computed externally (the model is
    called without labels) so per-class weights can be applied.
    Returns (mean loss, accuracy).
    """
    model.train()
    criterion = nn.CrossEntropyLoss(weight=weights)
    running_loss = 0.0
    hits = 0
    seen = 0

    for batch in tqdm(loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        loss = criterion(logits, labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
        hits += (logits.argmax(dim=1) == labels).sum().item()
        seen += labels.size(0)

    return running_loss / len(loader), hits / seen
296
+
297
# Retrain with weights — a lower-LR fine-tune pass on top of the earlier runs,
# using a fresh optimizer/scheduler pair so the schedule starts over.
optimizer3 = AdamW(model.parameters(), lr=5e-6)
scheduler3 = get_linear_schedule_with_warmup(optimizer3, num_warmup_steps=30, num_training_steps=len(train_loader)*3)

for epoch in range(3):
    print(f"\n🔄 Epoch {epoch+1}/3")
    train_loss, train_acc = train_epoch_weighted(model, train_loader, optimizer3, scheduler3, device, class_weights)
    test_acc = evaluate(model, test_loader, device)
    print(f"Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}% | Test Acc: {test_acc*100:.2f}%")
    # Still checkpoint against the best accuracy from the earlier runs.
    if test_acc > best_accuracy:
        best_accuracy = test_acc
        torch.save(model.state_dict(), "best_model.pt")
        print(f"✅ Best model saved! Accuracy: {best_accuracy*100:.2f}%")
310
+
311
# --- CELL ---

import shutil
# Final backup of the (possibly re-improved) checkpoint to Drive.
shutil.copy("best_model.pt", "/content/drive/MyDrive/best_model.pt")
print("✅ Saved to Google Drive!")