DouDou committed on
Upload data2/instruction_generation/pipeline.py with huggingface_hub
Browse files
data2/instruction_generation/pipeline.py
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
instruction_generation unified entry script.

Supports --mode summarize|parse|all:
  summarize -- summarize repository READMEs
  parse     -- extract/parse repository functions
  all       -- run both steps in sequence
"""

import os
import sys
import asyncio
import argparse
from pathlib import Path
from dotenv import load_dotenv

# Load .env file (must happen before importing logger, so that logger
# configuration can read environment variables set by the .env file)
env_file = Path(__file__).parent / ".env"
if env_file.exists():
    load_dotenv(env_file)
elif (Path(__file__).parent.parent / ".env").exists():
    # If not in current directory, fall back to the project root .env
    load_dotenv(Path(__file__).parent.parent / ".env")

# Add current directory to path (for importing local modules)
sys.path.insert(0, str(Path(__file__).parent))
# Add domain_code/src to path for reusing util functions.
# NOTE: because both inserts use index 0, domain_code/src ends up FIRST
# on sys.path and takes precedence over the script's own directory.
sys.path.insert(0, str(Path(__file__).parent.parent / "domain_code" / "src"))
from util import init_logger, logger


# Import pipeline step implementations (resolved via the sys.path edits above)
from summarize_repo_readme import process_all_repos as process_summarize
from extract_repo_functions import process_all_repos as process_extract
+
|
| 34 |
+
async def main():
    """Parse CLI arguments and run the instruction_generation pipeline.

    Modes:
        summarize -- summarize repository READMEs only.
        parse     -- extract/parse repository functions only.
        all       -- summarize READMEs first, then parse functions.

    Exits with status 1 when the repository directory does not exist or
    the mode is unrecognized (the latter is defensive: argparse already
    restricts --mode via ``choices``).
    """
    parser = argparse.ArgumentParser(
        description="instruction_generation unified entry tool",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Full pipeline: summarize README first, then parse functions
  python3 pipeline.py --mode all

  # Summarize README only
  python3 pipeline.py --mode summarize

  # Parse functions only (requires README_SUMMARY.md to exist)
  python3 pipeline.py --mode parse

  # Use local vLLM Qwen (default)
  python3 pipeline.py --mode all

  # Use OpenAI API
  export OPENAI_API_KEY="your-api-key"
  python3 pipeline.py --mode all --base_url https://api.openai.com/v1 --model gpt-4o-mini

  # Specify repository directory and other parameters
  python3 pipeline.py --mode all --repos_dir /path/to/repos_filtered --max_concurrency 16 --overwrite
""",
    )

    # Common parameters
    parser.add_argument(
        "--repos_dir",
        type=str,
        default=os.getenv("REPOS_DIR", "/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered"),
        help="Repository root directory path (can be read from REPOS_DIR env var)",
    )
    parser.add_argument(
        "--mode",
        type=str,
        choices=["summarize", "parse", "all"],
        default="all",
        help="Execution mode: 'summarize' (README only), 'parse' (functions only), 'all' (full pipeline)",
    )

    # LLM parameters
    parser.add_argument(
        "--base_url",
        type=str,
        default=os.getenv("OPENAI_BASE_URL", "http://localhost:8000/v1"),
        help="LLM API base URL (can be read from OPENAI_BASE_URL env var, default: http://localhost:8000/v1)",
    )
    parser.add_argument(
        "--model",
        type=str,
        default=os.getenv("DEFAULT_MODEL", "Qwen3"),
        help="Model name (can be read from DEFAULT_MODEL env var, default: Qwen3)",
    )
    parser.add_argument(
        "--api_key_env",
        type=str,
        default="OPENAI_API_KEY",
        help="API key environment variable name (default: OPENAI_API_KEY)",
    )

    # Performance parameters
    parser.add_argument(
        "--max_concurrency",
        type=int,
        default=int(os.getenv("MAX_CONCURRENCY", "8")),
        help="Maximum concurrency (can be read from MAX_CONCURRENCY env var, default: 8)",
    )
    parser.add_argument(
        "--max_file_chars",
        type=int,
        default=int(os.getenv("MAX_FILE_CHARS", "200000")),
        help="Maximum file size (chars, for parse mode only, can be read from MAX_FILE_CHARS env var, default: 200000)",
    )

    # Other parameters
    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite existing files",
    )
    parser.add_argument(
        "--log_file",
        type=str,
        default="instruction_generation/workdir/logs/pipeline.log",
        help="Log file path",
    )

    args = parser.parse_args()

    # Initialize logger
    init_logger(args.log_file, level="INFO")

    # Read the API key from the environment. SECURITY: never write any part
    # of the key (prefix/suffix/length) to disk or logs -- the previous
    # debug code that dumped key fragments to .cursor/debug.log leaked
    # secret material and has been removed. Only log whether a key exists.
    api_key = os.getenv(args.api_key_env, "none")
    if api_key == "none":
        logger.warning(
            "API key env var '%s' is not set; using placeholder 'none'" % args.api_key_env
        )

    # Check repository directory
    repos_dir = Path(args.repos_dir)
    if not repos_dir.exists():
        logger.error(f"Repository directory does not exist: {repos_dir}")
        sys.exit(1)

    # Create log directory
    log_file_path = Path(args.log_file)
    log_file_path.parent.mkdir(parents=True, exist_ok=True)

    logger.info("=" * 80)
    logger.info("instruction_generation tool started")
    logger.info("=" * 80)
    logger.info(f"Mode: {args.mode}")
    logger.info(f"Repository directory: {repos_dir}")
    logger.info(f"LLM API: {args.base_url}")
    logger.info(f"Model: {args.model}")
    logger.info(f"Max concurrency: {args.max_concurrency}")
    logger.info(f"Overwrite existing files: {args.overwrite}")
    logger.info("=" * 80)

    # Keyword arguments shared by both pipeline steps (deduplicated from
    # the three mode branches, which previously repeated them verbatim).
    common_kwargs = dict(
        repos_dir=repos_dir,
        base_url=args.base_url,
        model=args.model,
        api_key=api_key,
        log_file=str(log_file_path),
        max_concurrency=args.max_concurrency,
        overwrite=args.overwrite,
    )

    # Execute based on mode
    if args.mode == "summarize":
        # Summarize README only
        logger.info("Starting: README summarization")
        await process_summarize(**common_kwargs)

        logger.info("\n" + "=" * 80)
        logger.info("README summarization complete!")
        logger.info("=" * 80)

    elif args.mode == "parse":
        # Parse functions only
        logger.info("Starting: Function parsing")
        await process_extract(max_file_chars=args.max_file_chars, **common_kwargs)

        logger.info("\n" + "=" * 80)
        logger.info("Function parsing complete!")
        logger.info("=" * 80)

    elif args.mode == "all":
        # Full pipeline: summarize README first, then parse functions
        logger.info("Starting: Full pipeline")
        logger.info("\n" + "-" * 80)
        logger.info("Step 1/2: README summarization")
        logger.info("-" * 80)

        await process_summarize(**common_kwargs)

        logger.info("\n" + "-" * 80)
        logger.info("Step 2/2: Function parsing")
        logger.info("-" * 80)

        # When parsing functions, if README_SUMMARY.md doesn't exist, overwrite=False will auto-skip
        # But if user requests overwrite, it will try to parse even without README_SUMMARY.md (will skip repos without summary)
        await process_extract(max_file_chars=args.max_file_chars, **common_kwargs)

        logger.info("\n" + "=" * 80)
        logger.info("Full pipeline complete!")
        logger.info("=" * 80)

    else:
        # Defensive: unreachable while argparse 'choices' is in place.
        logger.error(f"Unknown mode: {args.mode}")
        sys.exit(1)

    logger.info("\n" + "=" * 80)
    logger.info("All tasks complete!")
    logger.info("=" * 80)
| 283 |
+
|
| 284 |
+
if __name__ == "__main__":
    # Script entry point: run the async pipeline to completion.
    asyncio.run(main())
|