Spaces:

DeL-TaiseiOzaki
/

Repository_Scaner

Sleeping

App Files Community

DeL-TaiseiOzaki committed on Oct 30, 2024

Commit

b212889

1 Parent(s): ed30199

s

Browse files

Files changed (5) hide show

app.py +110 -93
config/settings.py +29 -2
core/file_scanner.py +37 -22
scan.sh +4 -45
services/llm_service.py +14 -31

app.py CHANGED Viewed

@@ -1,11 +1,13 @@
 import streamlit as st
 import tempfile
 import git
 from pathlib import Path
 from datetime import datetime
-from services.llm_service import LLMService
 from core.file_scanner import FileScanner, FileInfo
-from typing import List
 st.set_page_config(
    page_title="Repository Code Analysis",
@@ -13,121 +15,135 @@ st.set_page_config(
    layout="wide"
 )
 st.markdown("""
 <style>
-   .stApp {
-       background-color: #0e1117;
-       color: #ffffff;
-   }
-   .chat-message {
-       padding: 1rem;
-       margin: 1rem 0;
-       border-radius: 0.5rem;
-   }
-   .assistant-message {
-       background-color: #1e2329;
-       color: #ffffff;
-   }
-   .stButton button {
-       background-color: #2ea44f;
-       color: #ffffff;
-   }
-   .stTextArea textarea {
-       background-color: #1e2329;
-       color: #ffffff;
-   }
 </style>
 """, unsafe_allow_html=True)
 def create_download_content(files: List[FileInfo]) -> str:
-   content = "# スキャン結果\n\n"
-   for file in files:
-       content += f"## {file.path}\n"
-       content += f"サイズ: {file.formatted_size}\n"
-       content += f"エンコーディング: {file.encoding or '不明'}\n\n"
-       if file.content:
-           content += f"```{file.extension[1:] if file.extension else ''}\n"
-           content += file.content
-           content += "\n```\n\n"
-   return content
 def clone_repository(repo_url: str) -> Path:
-   temp_dir = Path(tempfile.mkdtemp())
-   git.Repo.clone_from(repo_url, temp_dir)
-   return temp_dir
 if 'repo_content' not in st.session_state:
-   st.session_state.repo_content = None
 if 'temp_dir' not in st.session_state:
-   st.session_state.temp_dir = None
 if 'llm_service' not in st.session_state:
-   try:
-       st.session_state.llm_service = LLMService()
-   except ValueError as e:
-       st.error(str(e))
-       st.stop()
 st.title("🔍 リポジトリ解析・質問システム")
 with st.sidebar:
-    if not st.session_state.llm_service.settings.anthropic_api_key:
-        st.error("Anthropic API key is required")
-        st.stop()
-    st.write("Using Claude model")
-    # LLM機能の切り替え
-    use_llm = st.toggle("LLMによるコード解説を有効にする", value=True, key="use_llm")
-    st.divider()
-    st.subheader("📌 使い方")
-    if use_llm:
-        st.markdown("""
-        1. GitHubリポジトリのURLを入力
-        2. スキャンを実行
-        3. コードについて質問（最大5ターンの会話が可能）
-        """)
-    else:
-        st.markdown("""
-        1. GitHubリポジトリのURLを入力
-        2. スキャンを実行してコードを解析
-        """)
 repo_url = st.text_input(
    "GitHubリポジトリのURLを入力",
    placeholder="https://github.com/username/repository.git"
 )
 if st.button("スキャン開始", disabled=not repo_url):
-   try:
-       with st.spinner('リポジトリをクローン中...'):
-           temp_dir = clone_repository(repo_url)
-           st.session_state.temp_dir = temp_dir
-       with st.spinner('ファイルをスキャン中...'):
-           scanner = FileScanner(temp_dir)
-           files = scanner.scan_files()
-           st.session_state.repo_content = LLMService.format_code_content(files)
-       st.success(f"スキャン完了: {len(files)}個のファイルを検出")
-       scan_result = create_download_content(files)
-       st.download_button(
-           label="スキャン結果をダウンロード",
-           data=scan_result,
-           file_name=f"scan_result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md",
-           mime="text/markdown"
-       )
-       st.session_state.llm_service.clear_history()
-   except Exception as e:
-       st.error(f"エラーが発生しました: {str(e)}")
-if st.session_state.repo_content and st.session_state.use_llm:
     st.divider()
     st.subheader("💭 コードについて質問する")
     for message in st.session_state.llm_service.conversation_history:
         if message.role == "assistant":
             st.markdown(f'<div class="chat-message assistant-message">{message.content}</div>',
@@ -157,9 +173,10 @@ if st.session_state.repo_content and st.session_state.use_llm:
                 else:
                     st.rerun()
 if st.session_state.temp_dir and Path(st.session_state.temp_dir).exists():
-   try:
-       import shutil
-       shutil.rmtree(st.session_state.temp_dir)
-   except:
-       pass

 import streamlit as st
 import tempfile
 import git
+import os
 from pathlib import Path
 from datetime import datetime
+from config.settings import Settings
 from core.file_scanner import FileScanner, FileInfo
+from services.llm_service import LLMService
+from typing import List, Set
 st.set_page_config(
    page_title="Repository Code Analysis",
    layout="wide"
 )
+# ダークテーマの設定
 st.markdown("""
 <style>
+    .stApp {
+        background-color: #0e1117;
+        color: #ffffff;
+    }
+    .chat-message {
+        padding: 1rem;
+        margin: 1rem 0;
+        border-radius: 0.5rem;
+    }
+    .assistant-message {
+        background-color: #1e2329;
+        color: #ffffff;
+    }
 </style>
 """, unsafe_allow_html=True)
 def create_download_content(files: List[FileInfo]) -> str:
+    content = "# スキャン結果\n\n"
+    for file in files:
+        content += f"## {file.path}\n"
+        content += f"サイズ: {file.formatted_size}\n"
+        content += f"エンコーディング: {file.encoding or '不明'}\n\n"
+        if file.content:
+            content += f"```{file.extension[1:] if file.extension else ''}\n"
+            content += file.content
+            content += "\n```\n\n"
+    return content
 def clone_repository(repo_url: str) -> Path:
+    temp_dir = Path(tempfile.mkdtemp())
+    git.Repo.clone_from(repo_url, temp_dir)
+    return temp_dir
+# セッション状態の初期化
 if 'repo_content' not in st.session_state:
+    st.session_state.repo_content = None
 if 'temp_dir' not in st.session_state:
+    st.session_state.temp_dir = None
 if 'llm_service' not in st.session_state:
+    try:
+        api_key = os.getenv("ANTHROPIC_API_KEY")
+        if not api_key:
+            st.error("ANTHROPIC_API_KEY環境変数が設定されていません")
+            st.stop()
+        st.session_state.llm_service = LLMService(api_key)
+    except Exception as e:
+        st.error(str(e))
+        st.stop()
+# メインのUIレイアウト
 st.title("🔍 リポジトリ解析・質問システム")
+# サイドバーの設定
 with st.sidebar:
+    st.subheader("📌 使い方")
+    st.markdown("""
+    1. スキャン対象の拡張子を選択
+    2. GitHubリポジトリのURLを入力
+    3. スキャンを実行
+    4. コードについて質問（最大5ターンの会話が可能）
+    """)
+    # スキャン対象の拡張子選択
+    st.subheader("🔍 スキャン対象の選択")
+    # 拡張子をカテゴリごとに表示
+    st.write("プログラミング言語:")
+    prog_exts = {'.py', '.js', '.ts', '.java', '.cpp', '.hpp', '.c', '.h', '.go', '.rs'}
+    selected_prog = {ext: st.checkbox(ext, value=True, key=f"prog_{ext}")
+                    for ext in prog_exts}
+    st.write("設定ファイル:")
+    config_exts = {'.json', '.yml', '.yaml', '.toml'}
+    selected_config = {ext: st.checkbox(ext, value=True, key=f"config_{ext}")
+                      for ext in config_exts}
+    st.write("ドキュメント:")
+    doc_exts = {'.md', '.txt'}
+    selected_doc = {ext: st.checkbox(ext, value=True, key=f"doc_{ext}")
+                   for ext in doc_exts}
+    # 選択された拡張子の集合を作成
+    selected_extensions = {ext for exts in [selected_prog, selected_config, selected_doc]
+                         for ext, selected in exts.items() if selected}
+# URLの入力
 repo_url = st.text_input(
    "GitHubリポジトリのURLを入力",
    placeholder="https://github.com/username/repository.git"
 )
+# スキャン実行ボタン
 if st.button("スキャン開始", disabled=not repo_url):
+    try:
+        with st.spinner('リポジトリをクローン中...'):
+            temp_dir = clone_repository(repo_url)
+            st.session_state.temp_dir = temp_dir
+        with st.spinner('ファイルをスキャン中...'):
+            scanner = FileScanner(temp_dir, selected_extensions)
+            files = scanner.scan_files()
+            st.session_state.repo_content = LLMService.format_code_content(files)
+        st.success(f"スキャン完了: {len(files)}個のファイルを検出")
+        # スキャン結果のダウンロードボタン
+        scan_result = create_download_content(files)
+        st.download_button(
+            label="スキャン結果をダウンロード",
+            data=scan_result,
+            file_name=f"scan_result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md",
+            mime="text/markdown"
+        )
+        # 新しいスキャン時に会話履歴をクリア
+        st.session_state.llm_service.clear_history()
+    except Exception as e:
+        st.error(f"エラーが発生しました: {str(e)}")
+# スキャン完了後の質問セクション
+if st.session_state.repo_content:
     st.divider()
     st.subheader("💭 コードについて質問する")
+    # 会話履歴の表示
     for message in st.session_state.llm_service.conversation_history:
         if message.role == "assistant":
             st.markdown(f'<div class="chat-message assistant-message">{message.content}</div>',
                 else:
                     st.rerun()
+# セッション終了時のクリーンアップ
 if st.session_state.temp_dir and Path(st.session_state.temp_dir).exists():
+    try:
+        import shutil
+        shutil.rmtree(st.session_state.temp_dir)
+    except:
+        pass

config/settings.py CHANGED Viewed

@@ -1,10 +1,37 @@
 from pathlib import Path
 from datetime import datetime
 class Settings:
     DEFAULT_OUTPUT_DIR = Path("output")
     TIMESTAMP_FORMAT = "%Y%m%d_%H%M%S"
     @classmethod
     def get_timestamp(cls) -> str:
         return datetime.now().strftime(cls.TIMESTAMP_FORMAT)
@@ -14,5 +41,5 @@ class Settings:
         return cls.DEFAULT_OUTPUT_DIR / f"repo_clone_{timestamp}"
     @classmethod
-    def get_log_file(cls, timestamp: str) -> Path:
-        return cls.DEFAULT_OUTPUT_DIR / f"scan_log_{timestamp}.txt"

 from pathlib import Path
 from datetime import datetime
+from typing import Set
 class Settings:
+    # デフォルト設定
     DEFAULT_OUTPUT_DIR = Path("output")
     TIMESTAMP_FORMAT = "%Y%m%d_%H%M%S"
+    # デフォルトのスキャン対象拡張子
+    DEFAULT_EXTENSIONS = {
+        # プログラミング言語
+        '.py',    # Python
+        '.js',    # JavaScript
+        '.ts',    # TypeScript
+        '.java',  # Java
+        '.cpp',   # C++
+        '.hpp',   # C++ Header
+        '.c',     # C
+        '.h',     # C Header
+        '.go',    # Go
+        '.rs',    # Rust
+        # 設定ファイル
+        '.json',  # JSON
+        '.yml',   # YAML
+        '.yaml',  # YAML
+        '.toml',  # TOML
+        # ドキュメント
+        '.md',    # Markdown
+        '.txt',   # Text
+    }
     @classmethod
     def get_timestamp(cls) -> str:
         return datetime.now().strftime(cls.TIMESTAMP_FORMAT)
         return cls.DEFAULT_OUTPUT_DIR / f"repo_clone_{timestamp}"
     @classmethod
+    def get_output_file(cls, timestamp: str) -> Path:
+        return cls.DEFAULT_OUTPUT_DIR / f"scan_result_{timestamp}.md"

core/file_scanner.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import List, Dict, Optional
 from dataclasses import dataclass
 import chardet
@@ -21,15 +21,41 @@ class FileInfo:
             return f"{self.size/(1024*1024):.1f} MB"
 class FileScanner:
-    TARGET_EXTENSIONS = {'.py', '.sh', '.rb', '.js', '.ts', '.java', '.cpp',
-                        '.hpp', '.c', '.h', '.go', '.rs', '.php', '.json',
-                        '.yml', '.yaml', '.toml', '.ini', '.md', '.txt'}
-    EXCLUDED_DIRS = {'.git', '__pycache__', 'node_modules', 'venv', '.env'}
-    MAX_FILE_SIZE = 1 * 1024 * 1024
-    def __init__(self, base_dir: Path):
         self.base_dir = base_dir
     def scan_files(self) -> List[FileInfo]:
         if not self.base_dir.exists():
@@ -38,19 +64,10 @@ class FileScanner:
         files = []
         for entry in self.base_dir.glob("**/*"):
-            if (entry.is_file() and
-                entry.suffix.lower() in self.TARGET_EXTENSIONS and
-                not any(excluded in entry.parts for excluded in self.EXCLUDED_DIRS) and
-                entry.stat().st_size <= self.MAX_FILE_SIZE):
-                try:
-                    with entry.open('rb') as f:
-                        raw_data = f.read(4096)
-                        encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
-                    with entry.open('r', encoding=encoding) as f:
-                        content = f.read()
                     files.append(FileInfo(
                         path=entry.absolute(),
                         size=entry.stat().st_size,
@@ -58,7 +75,5 @@ class FileScanner:
                         content=content,
                         encoding=encoding
                     ))
-                except:
-                    continue
         return sorted(files, key=lambda x: str(x.path))

 from pathlib import Path
+from typing import List, Dict, Optional, Set
 from dataclasses import dataclass
 import chardet
             return f"{self.size/(1024*1024):.1f} MB"
 class FileScanner:
+    # スキャン対象から除外するディレクトリ
+    EXCLUDED_DIRS = {
+        '.git', '__pycache__', 'node_modules', 'venv', '.env',
+        'build', 'dist', 'target', 'bin', 'obj'
+    }
+    def __init__(self, base_dir: Path, target_extensions: Set[str]):
         self.base_dir = base_dir
+        self.target_extensions = target_extensions
+    def _should_scan_file(self, path: Path) -> bool:
+        if any(excluded in path.parts for excluded in self.EXCLUDED_DIRS):
+            return False
+        return path.suffix.lower() in self.target_extensions
+    def _read_file_content(self, file_path: Path) -> tuple[Optional[str], Optional[str]]:
+        try:
+            with file_path.open('rb') as f:
+                raw_data = f.read(4096)
+                result = chardet.detect(raw_data)
+            encoding = result['encoding'] if result['confidence'] > 0.7 else 'utf-8'
+            try:
+                with file_path.open('r', encoding=encoding) as f:
+                    return f.read(), encoding
+            except UnicodeDecodeError:
+                try:
+                    with file_path.open('r', encoding='cp932') as f:
+                        return f.read(), 'cp932'
+                except UnicodeDecodeError:
+                    return None, None
+        except (OSError, ValueError):
+            return None, None
     def scan_files(self) -> List[FileInfo]:
         if not self.base_dir.exists():
         files = []
         for entry in self.base_dir.glob("**/*"):
+            if entry.is_file() and self._should_scan_file(entry):
+                content, encoding = self._read_file_content(entry)
+                if content is not None:
                     files.append(FileInfo(
                         path=entry.absolute(),
                         size=entry.stat().st_size,
                         content=content,
                         encoding=encoding
                     ))
         return sorted(files, key=lambda x: str(x.path))

scan.sh CHANGED Viewed

@@ -1,49 +1,8 @@
 #!/bin/bash
-# エラーが発生した場合に停止
-set -e
-# デフォルトのターゲットパスを設定
-# ここを変更することで対象を変更できます
-TARGET_PATH="https://github.com/DeL-TaiseiOzaki/idebate_scraping.git"  # 例: Linuxカーネル
-# TARGET_PATH="/path/to/your/directory"  # ローカルディレクトリの例
-# 必要なディレクトリの存在確認
-if [ ! -d "output" ]; then
-    mkdir output
-fi
-# Pythonの存在確認
-if ! command -v python3 &> /dev/null; then
-    echo "Error: Python3 is not installed"
     exit 1
 fi
-# GitHubリポジトリの場合、Gitの存在確認
-if [[ $TARGET_PATH == http* ]] && [[ $TARGET_PATH == *github.com* ]]; then
-    if ! command -v git &> /dev/null; then
-        echo "Error: Git is not installed"
-        exit 1
-    fi
-    echo "Scanning GitHub repository: $TARGET_PATH"
-else
-    if [ ! -d "$TARGET_PATH" ]; then
-        echo "Error: Directory not found: $TARGET_PATH"
-        exit 1
-    fi
-    echo "Scanning local directory: $TARGET_PATH"
-fi
-# スキャンの実行
-echo "Starting directory scan..."
-python3 main.py "$TARGET_PATH"
-exit_code=$?
-if [ $exit_code -eq 0 ]; then
-    echo "Scan completed successfully!"
-    echo "Results are saved in the 'output' directory"
-else
-    echo "Scan failed with exit code: $exit_code"
-    exit $exit_code
-fi

 #!/bin/bash
+if [ $# -ne 1 ]; then
+    echo "Usage: ./scan.sh <github_url or directory_path>"
     exit 1
 fi
+target_path="$1"
+python main.py "$target_path"

services/llm_service.py CHANGED Viewed

@@ -1,7 +1,6 @@
-from typing import Optional, List, Dict
 import anthropic
 from dataclasses import dataclass
-from config.llm_settings import LLMSettings
 from core.file_scanner import FileInfo
 @dataclass
@@ -12,15 +11,10 @@ class Message:
 class LLMService:
     MAX_TURNS = 5
-    def __init__(self):
-        self.settings = LLMSettings()
-        self.claude_client = anthropic.Anthropic(api_key=self.settings.anthropic_api_key)
         self.conversation_history: List[Message] = []
-    def switch_model(self, model: str):
-        if model.lower() != "claude":
-            raise ValueError("Only Claude model is available")
     def create_prompt(self, content: str, query: str) -> str:
         return f"""以下はGitHubリポジトリのコード解析結果です。このコードについて質問に答えてください。
@@ -30,36 +24,22 @@ class LLMService:
 質問: {query}
 できるだけ具体的に、コードの内容を参照しながら回答してください。"""
     def _add_to_history(self, role: str, content: str):
         self.conversation_history.append(Message(role=role, content=content))
         if len(self.conversation_history) > self.MAX_TURNS * 2:
             self.conversation_history = self.conversation_history[-self.MAX_TURNS * 2:]
-    def _format_messages_for_claude(self) -> List[Dict[str, str]]:
-        return [{"role": msg.role, "content": msg.content}
-                for msg in self.conversation_history]
-    def get_conversation_history(self) -> List[Dict[str, str]]:
-        return [{"role": msg.role, "content": msg.content}
-                for msg in self.conversation_history]
-    def clear_history(self):
-        self.conversation_history = []
     def get_response(self, content: str, query: str) -> tuple[Optional[str], Optional[str]]:
-        """LLMを使用して回答を生成"""
         try:
             prompt = self.create_prompt(content, query)
             self._add_to_history("user", prompt)
-            # Claudeへのリクエストを修正
-            response = self.claude_client.messages.create(
-                model="claude-3-sonnet-20240229",
-                max_tokens=1024,
-                messages=[
-                    {"role": "user", "content": prompt}
-                ]
             )
             answer = response.content[0].text
@@ -68,7 +48,10 @@ class LLMService:
         except Exception as e:
             return None, f"エラーが発生しました: {str(e)}"
     @staticmethod
     def format_code_content(files: List[FileInfo]) -> str:
         formatted_content = []
@@ -76,4 +59,4 @@ class LLMService:
             formatted_content.append(
                 f"#ファイルパス\n{file_info.path}\n------------\n{file_info.content}\n"
             )
-        return "\n".join(formatted_content)

 import anthropic
 from dataclasses import dataclass
+from typing import List, Optional, Dict
 from core.file_scanner import FileInfo
 @dataclass
 class LLMService:
     MAX_TURNS = 5
+    def __init__(self, api_key: str):
+        self.client = anthropic.Anthropic(api_key=api_key)
         self.conversation_history: List[Message] = []
     def create_prompt(self, content: str, query: str) -> str:
         return f"""以下はGitHubリポジトリのコード解析結果です。このコードについて質問に答えてください。
 質問: {query}
 できるだけ具体的に、コードの内容を参照しながら回答してください。"""
     def _add_to_history(self, role: str, content: str):
         self.conversation_history.append(Message(role=role, content=content))
         if len(self.conversation_history) > self.MAX_TURNS * 2:
             self.conversation_history = self.conversation_history[-self.MAX_TURNS * 2:]
     def get_response(self, content: str, query: str) -> tuple[Optional[str], Optional[str]]:
         try:
             prompt = self.create_prompt(content, query)
             self._add_to_history("user", prompt)
+            response = self.client.messages.create(
+                model="claude-3-5-sonnet-latest",
+                messages=[{"role": msg.role, "content": msg.content}
+                         for msg in self.conversation_history],
+                max_tokens=1024
             )
             answer = response.content[0].text
         except Exception as e:
             return None, f"エラーが発生しました: {str(e)}"
+    def clear_history(self):
+        self.conversation_history = []
     @staticmethod
     def format_code_content(files: List[FileInfo]) -> str:
         formatted_content = []
             formatted_content.append(
                 f"#ファイルパス\n{file_info.path}\n------------\n{file_info.content}\n"
             )
+        return "\n".join(formatted_content)