arun3676 commited on
Commit
ff3ea6b
·
1 Parent(s): 5c6379b

Add Hugging Face integration and remote model support

Browse files

- Created Hugging Face Space files (app.py, requirements.txt, README.md)
- Enhanced optimized_code_analyzer_enhanced.py with remote API support
- Updated matrix_final.py with model selection UI
- Added remote model configuration and connection testing
- Support for both local CodeT5+ and remote fine-tuned DeepSeek models

hf-space/README.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Fine-tuned Code Analyzer API
3
+ emoji: 🤖
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: docker
7
+ pinned: false
8
+ ---
9
+
10
+ # Fine-tuned Code Analyzer API
11
+
12
+ API endpoint for code analysis using fine-tuned DeepSeek model.
13
+
14
+ ## Features
15
+
16
+ - **Quality Scores**: 1-100 rating for code quality
17
+ - **Structured Analysis**: Bugs, Performance, Security sections
18
+ - **Code Improvements**: Specific suggestions with examples
19
+ - **Professional Output**: Consistent, detailed analysis format
20
+
21
+ ## Usage
22
+
23
+ ### POST /analyze
24
+
25
+ Analyze code for bugs, performance, and security issues.
26
+
27
+ **Request:**
28
+ ```json
29
+ {
30
+ "code": "def fibonacci(n):\n if n <= 1:\n return n\n return fibonacci(n-1) + fibonacci(n-2)",
31
+ "max_tokens": 300
32
+ }
33
+ ```
34
+
35
+ **Response:**
36
+ ```json
37
+ {
38
+ "analysis": "Quality Score: 35/100\n\nBUGS:\n- No error handling\n- Infinite recursion possible\n\nPERFORMANCE ISSUES:\n- Recursive calls cause exponential time complexity\n\nSECURITY CONCERNS:\n- No input validation\n\nIMPROVEMENTS:\n1. Use memoization to avoid redundant recursive calls\n2. Add input validation\n\nExample improved code:\n[Shows working fixes]",
39
+ "model": "fine-tuned-deepseek",
40
+ "status": "success"
41
+ }
42
+ ```
43
+
44
+ ### GET /health
45
+
46
+ Health check endpoint.
47
+
48
+ **Response:**
49
+ ```json
50
+ {
51
+ "status": "healthy",
52
+ "model": "fine-tuned-deepseek"
53
+ }
54
+ ```
55
+
56
+ ## Model Details
57
+
58
+ - **Base Model**: DeepSeek Coder 1.3B
59
+ - **Training Method**: LoRA (Low-Rank Adaptation)
60
+ - **Dataset**: 59+ high-quality code analysis examples
61
+ - **Fine-tuned for**: Code analysis, bug detection, performance optimization
hf-space/app.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM
4
+ from peft import PeftModel
5
+ import torch
6
+ import uvicorn
7
+
8
+ app = FastAPI()
9
+
10
+ # Load model once at startup
11
+ tokenizer = None
12
+ model = None
13
+
14
+ @app.on_event("startup")
15
+ async def load_model():
16
+ global tokenizer, model
17
+ print("🚀 Loading fine-tuned model...")
18
+
19
+ tokenizer = AutoTokenizer.from_pretrained(
20
+ "deepseek-ai/deepseek-coder-1.3b-instruct",
21
+ trust_remote_code=True
22
+ )
23
+ tokenizer.pad_token = tokenizer.eos_token
24
+
25
+ base_model = AutoModelForCausalLM.from_pretrained(
26
+ "deepseek-ai/deepseek-coder-1.3b-instruct",
27
+ torch_dtype=torch.float16,
28
+ device_map="auto",
29
+ trust_remote_code=True,
30
+ )
31
+
32
+ model = PeftModel.from_pretrained(
33
+ base_model,
34
+ "arun3676/fine-tuned-code-analyzer"
35
+ )
36
+ print("✅ Model loaded successfully!")
37
+
38
+ class CodeRequest(BaseModel):
39
+ code: str
40
+ max_tokens: int = 300
41
+
42
+ class AnalysisResponse(BaseModel):
43
+ analysis: str
44
+ model: str
45
+ status: str
46
+
47
+ @app.post("/analyze", response_model=AnalysisResponse)
48
+ async def analyze_code(request: CodeRequest):
49
+ try:
50
+ prompt = f"<s>[INST] Analyze this code for bugs, performance, and security issues. Give a quality score from 1-100 and provide a detailed analysis.\n\nCode:\n```\n{request.code}\n``` [/INST]"
51
+
52
+ inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(model.device)
53
+
54
+ with torch.no_grad():
55
+ outputs = model.generate(
56
+ **inputs,
57
+ max_new_tokens=request.max_tokens,
58
+ temperature=0.7,
59
+ do_sample=True,
60
+ pad_token_id=tokenizer.eos_token_id,
61
+ eos_token_id=tokenizer.eos_token_id,
62
+ )
63
+
64
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True)
65
+ analysis = response.split('[/INST]')[-1].strip()
66
+
67
+ return AnalysisResponse(
68
+ analysis=analysis,
69
+ model="fine-tuned-deepseek",
70
+ status="success"
71
+ )
72
+ except Exception as e:
73
+ raise HTTPException(status_code=500, detail=str(e))
74
+
75
+ @app.get("/health")
76
+ async def health_check():
77
+ return {"status": "healthy", "model": "fine-tuned-deepseek"}
78
+
79
+ @app.get("/")
80
+ async def root():
81
+ return {
82
+ "message": "Fine-tuned Code Analyzer API",
83
+ "endpoints": {
84
+ "POST /analyze": "Analyze code for bugs, performance, and security issues",
85
+ "GET /health": "Health check endpoint"
86
+ },
87
+ "model": "fine-tuned-deepseek"
88
+ }
89
+
90
+ if __name__ == "__main__":
91
+ uvicorn.run(app, host="0.0.0.0", port=7860)
hf-space/requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ transformers
4
+ peft
5
+ torch
6
+ accelerate
7
+ requests
matrix_final.py CHANGED
@@ -5,7 +5,7 @@ import random
5
  import sys
6
  from dotenv import load_dotenv
7
  from analyzer import CodeAnalyzer
8
- from optimized_code_analyzer import OptimizedCodeAnalyzer
9
 
10
  # Load environment variables
11
  load_dotenv()
@@ -281,15 +281,21 @@ def get_analyzer():
281
 
282
  analyzer = get_analyzer()
283
 
284
- # Local CodeT5+ analyzer (cached)
285
  @st.cache_resource
286
- def get_local_analyzer():
287
- return OptimizedCodeAnalyzer(
288
- model_id="Salesforce/codet5p-220m",
289
- precision="fp16", # fastest from benchmark
290
- quick_max_new_tokens=180,
291
- detailed_max_new_tokens=240,
292
- )
 
 
 
 
 
 
293
 
294
  def display_matrix_analysis_result(result: dict, model_name: str):
295
  """Display analysis result in clean, readable horizontal blocks."""
@@ -613,16 +619,52 @@ with st.sidebar:
613
  format_func=lambda x: f"📝 {x}" if x == "Code Analysis" else f"📦 {x}"
614
  )
615
 
616
- # Local model toggle and preset
617
- use_local = st.checkbox("💻 Use Local CodeT5+ (no external API)", value=False)
618
- local_preset = st.selectbox(
619
- "Local Inference Mode",
620
- ["Quick", "Detailed"],
621
- index=0,
622
- help="Quick = beams 1, ~180 tokens. Detailed = beams 2, ~240 tokens.",
623
- disabled=not use_local,
 
624
  )
625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
626
  if analysis_mode == "GitHub Repository":
627
  st.markdown("#### Repository Analysis")
628
  github_url = st.text_input(
@@ -855,10 +897,20 @@ with col2:
855
 
856
  else:
857
  # Single model analysis
858
- if use_local:
859
- st.markdown("#### 🤖 CODET5+_LOCAL_ANALYSIS")
860
- local = get_local_analyzer()
861
- if local_preset == "Quick":
 
 
 
 
 
 
 
 
 
 
862
  result = local.analyze_code_fast(code_input, mode="quick")
863
  # adapt to display format
864
  display_matrix_analysis_result({
@@ -871,10 +923,10 @@ with col2:
871
  "language": "auto",
872
  "line_count": len(code_input.splitlines()),
873
  "raw_response": result["analysis"],
874
- }, "CodeT5+ Local (Quick)")
875
  else:
876
  # streaming path – consume generator and show final
877
- local = get_local_analyzer()
878
  final_text = None
879
  for chunk in local.analyze_code_streaming(code_input, show_progress=True, mode="detailed"):
880
  final_text = chunk
@@ -888,7 +940,7 @@ with col2:
888
  "language": "auto",
889
  "line_count": len(code_input.splitlines()),
890
  "raw_response": final_text or "",
891
- }, "CodeT5+ Local (Detailed)")
892
  else:
893
  st.markdown(f"#### 🤖 {available_models[selected_model].upper()}_ANALYSIS")
894
  result = analyzer.analyze_code(
 
5
  import sys
6
  from dotenv import load_dotenv
7
  from analyzer import CodeAnalyzer
8
+ from optimized_code_analyzer_enhanced import EnhancedCodeAnalyzer
9
 
10
  # Load environment variables
11
  load_dotenv()
 
281
 
282
  analyzer = get_analyzer()
283
 
284
+ # Local analyzer (cached)
285
  @st.cache_resource
286
+ def get_local_analyzer(model_type="codet5", remote_url=None):
287
+ if remote_url:
288
+ return EnhancedCodeAnalyzer(
289
+ model_type="deepseek-finetuned-remote",
290
+ remote_api_url=remote_url
291
+ )
292
+ else:
293
+ return EnhancedCodeAnalyzer(
294
+ model_type=model_type,
295
+ precision="fp16",
296
+ quick_max_new_tokens=180,
297
+ detailed_max_new_tokens=300,
298
+ )
299
 
300
  def display_matrix_analysis_result(result: dict, model_name: str):
301
  """Display analysis result in clean, readable horizontal blocks."""
 
619
  format_func=lambda x: f"📝 {x}" if x == "Code Analysis" else f"📦 {x}"
620
  )
621
 
622
+ # Model Selection
623
+ st.markdown("#### 🤖 AI Model Selection")
624
+ model_choice = st.radio(
625
+ "Choose Analysis Model:",
626
+ [
627
+ "CodeT5+ (Fast - Local)",
628
+ "Fine-tuned DeepSeek (Accurate - Remote)"
629
+ ],
630
+ help="Local models run on your computer, Remote model runs on Hugging Face (always available)"
631
  )
632
 
633
+ # Remote model configuration
634
+ remote_api_url = None
635
+ if "Remote" in model_choice:
636
+ st.markdown("#### 🌐 Remote Model Configuration")
637
+ remote_api_url = st.text_input(
638
+ "Hugging Face Space URL:",
639
+ value="https://arun3676-fine-tuned-code-analyzer.hf.space",
640
+ help="Your Hugging Face Space URL"
641
+ )
642
+
643
+ # Test connection
644
+ if st.button("🔗 Test Connection"):
645
+ try:
646
+ import requests
647
+ response = requests.get(f"{remote_api_url}/health", timeout=5)
648
+ if response.status_code == 200:
649
+ st.success("✅ Connected to remote model!")
650
+ else:
651
+ st.error("❌ Connection failed")
652
+ except:
653
+ st.error("❌ Cannot reach remote model")
654
+
655
+ # Local model toggle and preset (for CodeT5+)
656
+ if "CodeT5+" in model_choice:
657
+ use_local = True
658
+ local_preset = st.selectbox(
659
+ "Local Inference Mode",
660
+ ["Quick", "Detailed"],
661
+ index=0,
662
+ help="Quick = beams 1, ~180 tokens. Detailed = beams 2, ~240 tokens.",
663
+ )
664
+ else:
665
+ use_local = False
666
+ local_preset = "Detailed"
667
+
668
  if analysis_mode == "GitHub Repository":
669
  st.markdown("#### Repository Analysis")
670
  github_url = st.text_input(
 
897
 
898
  else:
899
  # Single model analysis
900
+ if use_local or "Remote" in model_choice:
901
+ # Determine model type and display name
902
+ if "Remote" in model_choice:
903
+ st.markdown("#### 🤖 FINE-TUNED_DEEPSEEK_REMOTE_ANALYSIS")
904
+ model_type = "deepseek-finetuned-remote"
905
+ display_name = "Fine-tuned DeepSeek (Remote)"
906
+ else:
907
+ st.markdown("#### 🤖 CODET5+_LOCAL_ANALYSIS")
908
+ model_type = "codet5"
909
+ display_name = "CodeT5+ Local"
910
+
911
+ local = get_local_analyzer(model_type, remote_api_url)
912
+
913
+ if local_preset == "Quick" or "Remote" in model_choice:
914
  result = local.analyze_code_fast(code_input, mode="quick")
915
  # adapt to display format
916
  display_matrix_analysis_result({
 
923
  "language": "auto",
924
  "line_count": len(code_input.splitlines()),
925
  "raw_response": result["analysis"],
926
+ }, f"{display_name} (Quick)")
927
  else:
928
  # streaming path – consume generator and show final
929
+ local = get_local_analyzer(model_type, remote_api_url)
930
  final_text = None
931
  for chunk in local.analyze_code_streaming(code_input, show_progress=True, mode="detailed"):
932
  final_text = chunk
 
940
  "language": "auto",
941
  "line_count": len(code_input.splitlines()),
942
  "raw_response": final_text or "",
943
+ }, f"{display_name} (Detailed)")
944
  else:
945
  st.markdown(f"#### 🤖 {available_models[selected_model].upper()}_ANALYSIS")
946
  result = analyzer.analyze_code(
optimized_code_analyzer_enhanced.py CHANGED
@@ -39,6 +39,7 @@ class EnhancedCodeAnalyzer:
39
  precision: str = "fp16",
40
  quick_max_new_tokens: int = 180,
41
  detailed_max_new_tokens: int = 300,
 
42
  ):
43
  """
44
  Initialize the enhanced analyzer.
@@ -77,6 +78,7 @@ class EnhancedCodeAnalyzer:
77
  self.model = None
78
  self.tokenizer = None
79
  self.cache = {}
 
80
 
81
  # Create cache directory
82
  os.makedirs(cache_dir, exist_ok=True)
@@ -345,6 +347,10 @@ Code:
345
  Returns:
346
  Dict: Analysis result
347
  """
 
 
 
 
348
  # Check cache first
349
  cached_result = self._check_cache(code)
350
  if cached_result:
@@ -459,6 +465,51 @@ Code:
459
 
460
  return min(score, 100)
461
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
462
  def get_model_info(self) -> Dict[str, Any]:
463
  """Get information about the loaded model."""
464
  if self.model is None:
 
39
  precision: str = "fp16",
40
  quick_max_new_tokens: int = 180,
41
  detailed_max_new_tokens: int = 300,
42
+ remote_api_url: Optional[str] = None,
43
  ):
44
  """
45
  Initialize the enhanced analyzer.
 
78
  self.model = None
79
  self.tokenizer = None
80
  self.cache = {}
81
+ self.remote_api_url = remote_api_url
82
 
83
  # Create cache directory
84
  os.makedirs(cache_dir, exist_ok=True)
 
347
  Returns:
348
  Dict: Analysis result
349
  """
350
+ # Check if using remote model
351
+ if self.remote_api_url:
352
+ return self.analyze_code_remote(code, mode)
353
+
354
  # Check cache first
355
  cached_result = self._check_cache(code)
356
  if cached_result:
 
465
 
466
  return min(score, 100)
467
 
468
+ def analyze_code_remote(self, code: str, mode: str = "quick") -> Dict[str, Any]:
469
+ """Analyze code using remote Hugging Face API."""
470
+ import requests
471
+
472
+ if not self.remote_api_url:
473
+ raise ValueError("No remote API URL configured")
474
+
475
+ cached_result = self._check_cache(code)
476
+ if cached_result:
477
+ cached_result["cached"] = True
478
+ return cached_result
479
+
480
+ start_time = time.time()
481
+
482
+ try:
483
+ max_tokens = self.quick_max_new_tokens if mode == "quick" else self.detailed_max_new_tokens
484
+
485
+ response = requests.post(
486
+ f"{self.remote_api_url}/analyze",
487
+ json={"code": code, "max_tokens": max_tokens},
488
+ timeout=60
489
+ )
490
+ response.raise_for_status()
491
+
492
+ data = response.json()
493
+ analysis_text = data["analysis"]
494
+
495
+ quality_score = self._calculate_quality_score(analysis_text)
496
+ total_time = time.time() - start_time
497
+
498
+ result = {
499
+ "analysis": analysis_text,
500
+ "quality_score": quality_score,
501
+ "execution_time": total_time,
502
+ "model": "fine-tuned-deepseek-remote",
503
+ "model_type": "deepseek-finetuned-remote",
504
+ "cached": False
505
+ }
506
+
507
+ self._save_to_cache(code, result)
508
+ return result
509
+
510
+ except Exception as e:
511
+ raise Exception(f"Remote analysis failed: {e}")
512
+
513
  def get_model_info(self) -> Dict[str, Any]:
514
  """Get information about the loaded model."""
515
  if self.model is None: