Spaces:

codey-lab
/

Check-Git

Running

App Files Files Community

Alibrown commited on Feb 6

Commit

094916c

verified ·

1 Parent(s): bf2c919

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +334 -38

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,336 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

+# FREE to use under MIT + ESOL v.1.0
+# See https://github.com/volkansah
 import streamlit as st
+import requests
+import re
+import os
+import tempfile
+from typing import Dict, List, Tuple
+import json
+from huggingface_hub import InferenceClient
+# ============================================
+# STREAMLIT PERMISSION FIX
+# ============================================
+TEMP_STREAMLIT_HOME = os.path.join(tempfile.gettempdir(), "st_config_workaround")
+os.makedirs(TEMP_STREAMLIT_HOME, exist_ok=True)
+os.environ["STREAMLIT_HOME"] = TEMP_STREAMLIT_HOME
+os.environ["STREAMLIT_GATHER_USAGE_STATS"] = "false"
+CONFIG_PATH = os.path.join(TEMP_STREAMLIT_HOME, "config.toml")
+if not os.path.exists(CONFIG_PATH):
+    with open(CONFIG_PATH, "w") as f:
+        f.write("[browser]\ngatherUsageStats = false\n")
+# ============================================
+# LLM-POWERED ANALYZER
+# ============================================
+class MLRepoAnalyzerLLM:
+    def __init__(self, hf_token: str = None):
+        self.hf_token = hf_token
+        if hf_token:
+            self.client = InferenceClient(token=hf_token)
+        # Fallback patterns (wenn kein Token)
+        self.fake_indicators = [
+            r'openai\.', r'anthropic\.', r'cohere\.',
+            r'replicate\.', r'api\.mistral', r'groq\.',
+            r'requests\.post.*api', r'urllib.*api'
+        ]
+        self.legit_indicators = [
+            r'torch\.optim', r'loss\.backward\(\)', r'model\.train\(\)',
+            r'optimizer\.step\(\)', r'tf\.keras\.optimizers',
+            r'from\s+transformers\s+import\s+Trainer',
+            r'accelerator\.backward', r'DeepSpeed',
+            r'torch\.nn\.Module', r'forward\(self'
+        ]
+    def extract_repo_info(self, url: str) -> Tuple[str, str, str]:
+        """Extract owner, repo, branch from GitHub URL"""
+        pattern = r'github\.com/([^/]+)/([^/]+)(?:/tree/([^/]+))?'
+        match = re.search(pattern, url)
+        if not match:
+            raise ValueError("Invalid GitHub URL")
+        owner, repo = match.group(1), match.group(2)
+        branch = match.group(3) or 'main'
+        return owner, repo.replace('.git', ''), branch
+    def fetch_repo_tree(self, owner: str, repo: str, branch: str) -> List[Dict]:
+        """Fetch file tree via GitHub API"""
+        api_url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1"
+        response = requests.get(api_url, timeout=10)
+        if response.status_code != 200:
+            raise Exception(f"GitHub API error: {response.status_code}")
+        return response.json().get('tree', [])
+    def fetch_file_content(self, owner: str, repo: str, branch: str, path: str) -> str:
+        """Fetch raw file content"""
+        raw_url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{path}"
+        response = requests.get(raw_url, timeout=10)
+        return response.text if response.status_code == 200 else ""
+    def analyze_with_llm(self, code_snippet: str, filename: str) -> Dict:
+        """Use HF Inference API to analyze code"""
+        if not self.hf_token:
+            return None
+        prompt = f"""Analyze this Python file from a machine learning repository: {filename}
+Code snippet:
+```python
+{code_snippet[:2000]}  # Limit to avoid token limits
+```
+Determine if this is:
+1. REAL ML TRAINING CODE (contains actual model training, backprop, optimizers)
+2. API WRAPPER (just calls external APIs like OpenAI, Anthropic, etc.)
+3. UNCLEAR
+Respond in JSON format:
+{{
+  "classification": "REAL_TRAINING|API_WRAPPER|UNCLEAR",
+  "confidence": 0-100,
+  "reasoning": "brief explanation",
+  "key_indicators": ["indicator1", "indicator2"]
+}}"""
+        try:
+            # Use Qwen2.5-Coder or similar code-focused model
+            response = self.client.chat_completion(
+                messages=[{"role": "user", "content": prompt}],
+                model="Qwen/Qwen2.5-Coder-32B-Instruct",  # Free on HF Inference
+                max_tokens=500,
+                temperature=0.1
+            )
+            result_text = response.choices[0].message.content
+            # Extract JSON (handle markdown code blocks)
+            json_match = re.search(r'```json\s*(\{.*?\})\s*```', result_text, re.DOTALL)
+            if json_match:
+                return json.loads(json_match.group(1))
+            else:
+                # Try direct parse
+                return json.loads(result_text)
+        except Exception as e:
+            st.warning(f"LLM analysis failed for {filename}: {e}")
+            return None
+    def analyze_file_structure(self, files: List[Dict]) -> Dict:
+        """Quick structure check"""
+        py_files = [f for f in files if f['path'].endswith('.py')]
+        return {
+            'has_train_script': any('train' in f['path'].lower() for f in py_files),
+            'has_model_files': any('model' in f['path'].lower() for f in py_files),
+            'has_config': any(f['path'].endswith(('.yaml', '.yml', '.json', '.toml')) for f in files),
+            'has_requirements': any('requirements' in f['path'] or 'pyproject.toml' in f['path'] for f in files),
+            'python_file_count': len(py_files)
+        }
+    def analyze_with_patterns(self, content: str) -> Tuple[int, int]:
+        """Fallback pattern matching"""
+        fake_score = sum(5 for pattern in self.fake_indicators if re.search(pattern, content, re.IGNORECASE))
+        legit_score = sum(10 for pattern in self.legit_indicators if re.search(pattern, content, re.IGNORECASE))
+        return fake_score, legit_score
+    def classify_repo(self, url: str, use_llm: bool = True) -> Dict:
+        """Main classification"""
+        try:
+            owner, repo, branch = self.extract_repo_info(url)
+            files = self.fetch_repo_tree(owner, repo, branch)
+            structure = self.analyze_file_structure(files)
+            py_files = [f for f in files if f['path'].endswith('.py')][:10]
+            llm_results = []
+            pattern_fake_score = 0
+            pattern_legit_score = 0
+            for file_info in py_files:
+                content = self.fetch_file_content(owner, repo, branch, file_info['path'])
+                if not content:
+                    continue
+                # LLM Analysis (if token available)
+                if use_llm and self.hf_token:
+                    llm_result = self.analyze_with_llm(content, file_info['path'])
+                    if llm_result:
+                        llm_results.append({
+                            'file': file_info['path'],
+                            'result': llm_result
+                        })
+                # Pattern fallback
+                fake, legit = self.analyze_with_patterns(content)
+                pattern_fake_score += fake
+                pattern_legit_score += legit
+            # Combine LLM + Pattern results
+            if llm_results:
+                llm_real_count = sum(1 for r in llm_results if r['result']['classification'] == 'REAL_TRAINING')
+                llm_fake_count = sum(1 for r in llm_results if r['result']['classification'] == 'API_WRAPPER')
+                # LLM gets more weight
+                total_score = (llm_real_count * 30) - (llm_fake_count * 30) + (pattern_legit_score - pattern_fake_score)
+            else:
+                total_score = pattern_legit_score - pattern_fake_score
+            # Verdict
+            if total_score > 30:
+                verdict = "✅ LEGIT - Real ML Training Code"
+                confidence = "High"
+            elif total_score > 0:
+                verdict = "⚠️ MIXED - Contains some training code"
+                confidence = "Medium"
+            else:
+                verdict = "❌ FAKE - API Wrapper / No Real Training"
+                confidence = "High"
+            return {
+                'verdict': verdict,
+                'confidence': confidence,
+                'score': total_score,
+                'structure': structure,
+                'llm_results': llm_results,
+                'pattern_scores': {
+                    'fake': pattern_fake_score,
+                    'legit': pattern_legit_score
+                },
+                'repo_info': f"{owner}/{repo}@{branch}"
+            }
+        except Exception as e:
+            return {'error': str(e)}
+# ============================================
+# STREAMLIT UI
+# ============================================
+st.set_page_config(page_title="ML Repo Detector 🔍", page_icon="🤖", layout="wide")
+st.title("🤖 ML Training Repo Analyzer (LLM-Powered)")
+st.markdown("**AI-powered detection of fake ML repos using your HuggingFace token**")
+# Token input in sidebar
+with st.sidebar:
+    st.markdown("### 🔑 HuggingFace Setup")
+    hf_token = st.text_input(
+        "HF Token (optional)",
+        type="password",
+        help="Get your free token at https://huggingface.co/settings/tokens"
+    )
+    use_llm = st.checkbox(
+        "Use LLM Analysis",
+        value=bool(hf_token),
+        disabled=not hf_token,
+        help="Requires HF token. Uses Qwen2.5-Coder for deep analysis"
+    )
+    st.markdown("---")
+    st.markdown("### 🛠️ Models Used")
+    if use_llm:
+        st.success("✅ Qwen2.5-Coder-32B (Free)")
+    else:
+        st.info("📊 Pattern Matching Only")
+    st.markdown("---")
+    st.markdown("### 💡 How it works")
+    st.markdown("""
+    **With LLM:**
+    - Deep code understanding
+    - Context-aware analysis
+    - Higher accuracy
+    **Without LLM:**
+    - Pattern matching
+    - Regex-based detection
+    - Still pretty good!
+    """)
+# Main interface
+analyzer = MLRepoAnalyzerLLM(hf_token=hf_token if hf_token else None)
+repo_url = st.text_input(
+    "GitHub Repository URL",
+    placeholder="https://github.com/username/repo",
+    help="Enter a public GitHub repository URL"
+)
+col1, col2 = st.columns([1, 4])
+with col1:
+    analyze_btn = st.button("🚀 Analyze", type="primary", use_container_width=True)
+if analyze_btn:
+    if not repo_url:
+        st.error("Enter a GitHub URL!")
+    else:
+        with st.spinner("🔍 Analyzing repository..." + (" (using LLM)" if use_llm else " (pattern matching)")):
+            result = analyzer.classify_repo(repo_url, use_llm=use_llm and bool(hf_token))
+            if 'error' in result:
+                st.error(f"❌ Error: {result['error']}")
+            else:
+                # Verdict
+                st.markdown("---")
+                col1, col2, col3 = st.columns([3, 1, 1])
+                with col1:
+                    st.markdown(f"## {result['verdict']}")
+                with col2:
+                    st.metric("Confidence", result['confidence'])
+                with col3:
+                    st.metric("Score", result['score'])
+                # LLM Results
+                if result.get('llm_results'):
+                    st.markdown("### 🤖 LLM Analysis Results")
+                    for llm_res in result['llm_results'][:5]:
+                        with st.expander(f"📄 {llm_res['file']}"):
+                            res = llm_res['result']
+                            col1, col2 = st.columns(2)
+                            with col1:
+                                classification = res.get('classification', 'UNKNOWN')
+                                if classification == 'REAL_TRAINING':
+                                    st.success(f"✅ {classification}")
+                                elif classification == 'API_WRAPPER':
+                                    st.error(f"❌ {classification}")
+                                else:
+                                    st.warning(f"⚠️ {classification}")
+                            with col2:
+                                st.metric("Confidence", f"{res.get('confidence', 0)}%")
+                            st.markdown(f"**Reasoning:** {res.get('reasoning', 'N/A')}")
+                            if res.get('key_indicators'):
+                                st.markdown("**Key Indicators:**")
+                                for indicator in res['key_indicators']:
+                                    st.markdown(f"- {indicator}")
+                # Pattern Analysis (fallback/additional)
+                st.markdown("### 📊 Pattern Analysis")
+                col1, col2 = st.columns(2)
+                with col1:
+                    st.metric("Legit Patterns", result['pattern_scores']['legit'])
+                with col2:
+                    st.metric("Fake Patterns", result['pattern_scores']['fake'])
+                # Structure
+                st.markdown("### 📁 Repository Structure")
+                struct = result['structure']
+                cols = st.columns(4)
+                with cols[0]:
+                    st.metric("Python Files", struct['python_file_count'])
+                with cols[1]:
+                    st.write("✅" if struct['has_train_script'] else "❌", "train.py")
+                with cols[2]:
+                    st.write("✅" if struct['has_model_files'] else "❌", "model files")
+                with cols[3]:
+                    st.write("✅" if struct['has_config'] else "❌", "configs")
+# Footer
+st.markdown("---")
+st.markdown("**💡 Your HF token = your quota. No data stored. Analysis runs on HF's free inference API.**")