AiCoderv2 committed on
Commit
de9e2bd
·
verified ·
1 Parent(s): 947ec04

Deploy Gradio app with multiple files

Browse files
Files changed (2) hide show
  1. app.py +441 -0
  2. requirements.txt +10 -0
app.py ADDED
@@ -0,0 +1,441 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import base64
import io
import json
import math
import os
import re
import zipfile
from datetime import datetime
from pathlib import Path
from typing import Dict, Iterator, List, Optional, Tuple

import gradio as gr
import requests

from utils import (
    clean_code_content,
    get_file_language,
    estimate_tokens,
    create_chunked_output
)
from models import (
    process_github_repo,
    process_huggingface_repo,
    download_repo_as_zip
)
from config import (
    SUPPORTED_EXTENSIONS,
    MAX_FILE_SIZE,
    MAX_TOTAL_SIZE,
    CHUNK_SIZE,
    GITHUB_API_BASE,
    HF_API_BASE
)
33
+
34
# Custom stylesheet handed to gr.Blocks(css=...) in create_interface().
# It defines the page container width, a progress bar, the ".file-stats"
# box used by the statistics HTML, and the ".warning" / ".error" /
# ".success" message boxes.
css = """
.container {
max-width: 1200px;
margin: 0 auto;
}
.progress-bar {
height: 20px;
background: linear-gradient(90deg, #4CAF50, #45a049);
border-radius: 10px;
transition: width 0.3s ease;
}
.file-stats {
background: #f0f0f0;
padding: 10px;
border-radius: 5px;
margin: 10px 0;
}
.warning {
background: #fff3cd;
border: 1px solid #ffeaa7;
padding: 10px;
border-radius: 5px;
color: #856404;
}
.error {
background: #f8d7da;
border: 1px solid #f5c6cb;
padding: 10px;
border-radius: 5px;
color: #721c24;
}
.success {
background: #d4edda;
border: 1px solid #c3e6cb;
padding: 10px;
border-radius: 5px;
color: #155724;
}
"""
74
+
75
def validate_repo_url(url: str) -> Tuple[str, str]:
    """Classify a repository URL and extract its ``owner/name`` path.

    Args:
        url: Repository URL, with or without a scheme. Surrounding
            whitespace is ignored. A trailing ``.git`` and/or ``/`` is
            tolerated on the web-style URLs.

    Returns:
        A ``(platform, "owner/name")`` tuple where ``platform`` is either
        ``"github"`` or ``"huggingface"``.

    Raises:
        ValueError: If the URL matches none of the supported patterns.
    """
    url = url.strip()

    # The leading ``(?:^|[/@.])`` pins the host name to a real boundary
    # (start of string, "//", "user@", or a sub-domain dot) so that
    # look-alike hosts such as "notgithub.com" are rejected instead of
    # being silently treated as GitHub.
    github_patterns = [
        r'(?:^|[/@.])github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$',
        r'(?:^|[/@.])api\.github\.com/repos/([^/]+)/([^/]+)',
    ]
    hf_patterns = [
        r'(?:^|[/@.])huggingface\.co/([^/]+)/([^/]+?)(?:\.git)?/?$',
        r'(?:^|[/@.])hf\.co/([^/]+)/([^/]+?)(?:\.git)?/?$',
    ]

    # GitHub patterns are tried first, then Hugging Face, as before.
    for platform, patterns in (("github", github_patterns), ("huggingface", hf_patterns)):
        for pattern in patterns:
            # Host names are case-insensitive; the captured owner/name keep
            # the casing the user typed.
            match = re.search(pattern, url, re.IGNORECASE)
            if match:
                return platform, f"{match.group(1)}/{match.group(2)}"

    raise ValueError("Invalid repository URL. Please provide a valid GitHub or Hugging Face repository URL.")
102
+
103
def process_repository(
    repo_url: str,
    token: str = "",
    include_patterns: str = "",
    exclude_patterns: str = "",
    max_file_size_mb: int = 10,
    chunk_size: int = 50000,
    include_metadata: bool = True,
    remove_comments: bool = False,
    progress=gr.Progress()
) -> Iterator[Tuple[str, str, str]]:
    """Fetch a repository and stream it back as consolidated text chunks.

    This function is a generator: every result, including the final text,
    the "no files" notice and error messages, is delivered with ``yield``.
    A value passed to ``return`` inside a generator is not delivered to
    whoever iterates it, so all outcomes must be yielded to be visible.

    Args:
        repo_url: GitHub or Hugging Face repository URL.
        token: Optional access token forwarded to the platform fetcher.
        include_patterns: Comma-separated patterns of files to include.
        exclude_patterns: Comma-separated patterns of files to exclude.
        max_file_size_mb: Per-file size limit in megabytes; converted to
            bytes before being handed to the fetcher.
        chunk_size: Maximum number of characters per emitted chunk. A
            single file larger than this still goes out as one chunk.
        include_metadata: Whether to add the repository header to each
            chunk and a summary footer to the last one.
        remove_comments: Whether to pass file contents through
            ``clean_code_content`` first.
        progress: Gradio progress tracker.

    Yields:
        ``(text, stats_html, status)`` tuples: one per completed chunk,
        then a final one holding the last chunk plus the summary footer.
        ``status`` is ``"success"``, ``"error"`` or ``""`` (no files).
    """
    # NOTE(review): the handler's outputs are single components, so each
    # yielded chunk presumably replaces the previous one in the UI; confirm
    # whether earlier chunks should be accumulated or offered as files.
    try:
        repo_type, repo_path = validate_repo_url(repo_url)

        include_list = [p.strip() for p in (include_patterns or "").split(",") if p.strip()]
        exclude_list = [p.strip() for p in (exclude_patterns or "").split(",") if p.strip()]

        progress(0.1, desc="Fetching repository information...")

        # Both fetchers take the same arguments; only the platform differs.
        fetch_repo = process_github_repo if repo_type == "github" else process_huggingface_repo
        files_data, repo_info = fetch_repo(
            repo_path,
            token,
            include_list,
            exclude_list,
            # The slider may deliver a float; a byte limit should be an int.
            int(max_file_size_mb * 1024 * 1024),
        )

        if not files_data:
            yield "", "⚠️ No files found matching the criteria.", ""
            return

        progress(0.3, desc="Processing files...")

        total_files = len(files_data)
        processed_files = 0
        total_tokens = 0
        total_chars = 0

        # Repository header, repeated at the top of every chunk.
        header_lines = []
        if include_metadata:
            header_lines.append("=" * 80)
            header_lines.append(f"REPOSITORY: {repo_info.get('full_name', repo_path)}")
            # `or` also covers a description that is present but None/empty.
            header_lines.append(f"DESCRIPTION: {repo_info.get('description') or 'No description'}")
            header_lines.append(f"URL: {repo_url}")
            header_lines.append(f"PROCESSED: {datetime.now().isoformat()}")
            header_lines.append(f"TOTAL FILES: {total_files}")
            header_lines.append("=" * 80)
            header_lines.append("")
        header_text = "\n".join(header_lines)

        content_parts = [header_text]
        # Running length of "\n".join(content_parts), maintained
        # incrementally so the chunk is not re-joined for every file.
        chunk_len = len(header_text)
        files_in_chunk = 0

        for i, (file_path, content, file_size) in enumerate(files_data):
            progress(0.3 + (0.5 * i / total_files), desc=f"Processing file {i+1}/{total_files}")

            if remove_comments:
                content = clean_code_content(content, file_path)

            file_header = f"\n{'-' * 60}\n"
            file_header += f"FILE: {file_path}\n"
            file_header += f"SIZE: {file_size:,} bytes\n"
            file_header += f"LANGUAGE: {get_file_language(file_path)}\n"
            file_header += f"{'-' * 60}\n\n"

            file_content = file_header + content + "\n\n"

            # Flush the current chunk if this file would overflow it. A
            # chunk holding no file yet is never flushed, so an oversized
            # first file does not produce a header-only chunk.
            projected_len = chunk_len + 1 + len(file_content)
            if projected_len > chunk_size and files_in_chunk:
                yield (
                    "\n".join(content_parts),
                    generate_stats(processed_files, total_tokens, total_chars, total_files),
                    "success",
                )
                # The new chunk starts with the repository header only; the
                # file's own header is already part of file_content.
                content_parts = [header_text]
                chunk_len = len(header_text)
                files_in_chunk = 0

            content_parts.append(file_content)
            chunk_len += 1 + len(file_content)
            files_in_chunk += 1
            processed_files += 1
            total_chars += len(content)
            total_tokens += estimate_tokens(content)

        progress(0.9, desc="Finalizing...")

        final_content = "\n".join(content_parts)

        if include_metadata:
            footer = f"\n{'=' * 80}\n"
            footer += f"SUMMARY:\n"
            footer += f"- Files processed: {processed_files}\n"
            footer += f"- Total characters: {total_chars:,}\n"
            footer += f"- Estimated tokens: {total_tokens:,}\n"
            footer += f"- Repository: {repo_info.get('full_name', repo_path)}\n"
            footer += f"{'=' * 80}\n"
            final_content += footer

        progress(1.0, desc="Complete!")

        yield final_content, generate_stats(processed_files, total_tokens, total_chars, total_files), "success"

    except Exception as e:
        # Top-level UI boundary: report the failure instead of crashing.
        error_msg = f"❌ Error: {str(e)}"
        yield "", error_msg, "error"
222
+
223
def generate_stats(files_processed: int, tokens: int, chars: int, total_files: int) -> str:
    """Render the processing statistics as an HTML snippet.

    Args:
        files_processed: Number of files handled so far.
        tokens: Estimated token count over the processed files.
        chars: Character count over the processed files.
        total_files: Number of files selected for processing.

    Returns:
        A ``<div class="file-stats">`` block with thousands-separated
        counters and the integer average of tokens per processed file.
    """
    # max(..., 1) avoids a ZeroDivisionError when nothing was processed.
    average_tokens = tokens // max(files_processed, 1)
    return f"""
<div class="file-stats">
<h3>📊 Processing Statistics</h3>
<p><strong>Files Processed:</strong> {files_processed:,} / {total_files:,}</p>
<p><strong>Total Characters:</strong> {chars:,}</p>
<p><strong>Estimated Tokens:</strong> {tokens:,}</p>
<p><strong>Average Tokens per File:</strong> {average_tokens:,}</p>
</div>
"""
235
+
236
def download_repo_locally(repo_url: str, token: str = "") -> str:
    """Download a repository as a ZIP archive.

    Args:
        repo_url: GitHub or Hugging Face repository URL.
        token: Optional access token forwarded to ``download_repo_as_zip``.

    Returns:
        Whatever ``download_repo_as_zip`` returns; presumably the path of
        the downloaded archive, since the result feeds a ``gr.File``
        output (confirm against ``models.download_repo_as_zip``).

    Raises:
        gr.Error: If the URL is invalid or the download fails. The result
            is wired to a file output, so returning the error text would
            have it treated as a file path; raising ``gr.Error`` surfaces
            the message in the UI instead.
    """
    try:
        repo_type, repo_path = validate_repo_url(repo_url)
        host = "github.com" if repo_type == "github" else "huggingface.co"
        return download_repo_as_zip(f"{host}/{repo_path}", token)
    except Exception as e:
        raise gr.Error(f"Error downloading repository: {str(e)}") from e
248
+
249
+ # Create Gradio interface
250
def create_interface():
    """Build the Gradio Blocks UI and wire up its event handlers.

    The layout has an input column (URL, token, advanced options and the
    two action buttons), an information column, and an output section
    (statistics, generated text, status). The returned app is not
    launched here.

    Returns:
        The configured ``gr.Blocks`` instance.
    """
    with gr.Blocks(
        title="Repo-to-Text Converter",
        theme=gr.themes.Soft(),
        css=css
    ) as demo:

        gr.Markdown("""
# 📚 Repository to Text Converter

Convert GitHub or Hugging Face repositories into formatted text files perfect for LLM training.

**Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)**
""")

        with gr.Row():
            with gr.Column(scale=2):
                # Input section
                gr.Markdown("## 📥 Repository Input")

                repo_url = gr.Textbox(
                    label="Repository URL",
                    placeholder="https://github.com/username/repo or https://huggingface.co/username/repo",
                    lines=2
                )

                token = gr.Textbox(
                    label="Access Token (Optional)",
                    placeholder="GitHub token or Hugging Face token for private repos",
                    type="password"
                )

                with gr.Accordion("🔧 Advanced Options", open=False):
                    include_patterns = gr.Textbox(
                        label="Include Patterns (comma-separated)",
                        placeholder="*.py,*.md,src/**/*.py",
                        info="Only include files matching these patterns"
                    )

                    exclude_patterns = gr.Textbox(
                        label="Exclude Patterns (comma-separated)",
                        placeholder="*.git*,*.log,node_modules/**",
                        value="*.git*,*.log,node_modules/**,__pycache__/**,.DS_Store"
                    )

                    max_file_size = gr.Slider(
                        minimum=1,
                        maximum=100,
                        value=10,
                        step=1,
                        label="Max File Size (MB)",
                        info="Files larger than this will be skipped"
                    )

                    chunk_size = gr.Slider(
                        minimum=1000,
                        maximum=100000,
                        value=50000,
                        step=1000,
                        label="Chunk Size (characters)",
                        info="Split output into chunks of this size"
                    )

                    include_metadata = gr.Checkbox(
                        value=True,
                        label="Include Metadata",
                        info="Add repository information and statistics"
                    )

                    remove_comments = gr.Checkbox(
                        value=False,
                        label="Remove Comments",
                        info="Strip comments from code files (experimental)"
                    )

                process_btn = gr.Button(
                    "🚀 Process Repository",
                    variant="primary",
                    size="lg"
                )

                download_btn = gr.Button(
                    "⬇️ Download as ZIP",
                    variant="secondary"
                )

            with gr.Column(scale=1):
                # Info section
                gr.Markdown("## ℹ️ Information")

                gr.Markdown("""
### Supported Platforms:
- ✅ GitHub (public and private)
- ✅ Hugging Face (public and private)

### Supported File Types:
- Code files (.py, .js, .java, .cpp, etc.)
- Documentation (.md, .txt, .rst)
- Configuration files (.json, .yaml, .toml)
- And many more!

### Features:
- 🔄 Chunked output for large repos
- 📊 Token estimation
- 🎯 Pattern-based file filtering
- 🧹 Optional comment removal
""")

        # Output section
        gr.Markdown("## 📤 Output")

        with gr.Row():
            stats_display = gr.HTML(label="Statistics")

        output_text = gr.Textbox(
            label="Generated Text",
            lines=20,
            max_lines=50,
            show_copy_button=True,
            interactive=True
        )

        status_display = gr.HTML()

        # The processing handler and the examples table take the same
        # components in the same order (the order of process_repository's
        # parameters), so the list is defined once.
        processing_inputs = [
            repo_url,
            token,
            include_patterns,
            exclude_patterns,
            max_file_size,
            chunk_size,
            include_metadata,
            remove_comments
        ]

        # Event handlers
        process_btn.click(
            fn=process_repository,
            inputs=processing_inputs,
            outputs=[output_text, stats_display, status_display]
        )

        # Created here so the file component renders below the status area.
        download_file = gr.File(label="Downloaded Repository")
        download_btn.click(
            fn=download_repo_locally,
            inputs=[repo_url, token],
            outputs=download_file
        )

        # Examples
        gr.Markdown("## 🎯 Examples")
        gr.Examples(
            examples=[
                [
                    "https://github.com/gradio-app/gradio",
                    "",
                    "*.py,*.md",
                    "",
                    10,
                    50000,
                    True,
                    False
                ],
                [
                    "https://huggingface.co/huggingface/transformers",
                    "",
                    "*.py,*.md,*.rst",
                    "tests/**,docs/**",
                    5,
                    30000,
                    True,
                    False
                ]
            ],
            inputs=processing_inputs
        )

    return demo
434
+
435
if __name__ == "__main__":
    # Script entry point: build the UI and start the Gradio server.
    demo = create_interface()
    # `show_tips` is intentionally not passed: current Gradio releases no
    # longer accept it in launch(), and requirements.txt installs an
    # unpinned gradio, so keeping it raises TypeError at startup.
    demo.launch(
        share=True,
        show_error=True
    )
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# Python dependencies for the Repo-to-Text Gradio app.
requests
gradio
git+https://github.com/huggingface/transformers
torch
tokenizers
accelerate
sentencepiece
numpy
Pillow