import argparse
import csv

import requests
from llama_index.core import Document

FILE = "repo_files.csv"
def get_github_repo_files(repo_url):
    """
    Fetches all JavaScript files in the 'src' directory of a GitHub repository.

    Args:
        repo_url (str): The URL of the GitHub repository.

    Returns:
        list: A list of dictionaries, each containing file information
            (filename, source, url).
    """
    try:
        repo_name = repo_url.split("github.com/")[1].rstrip("/")
        api_url = f"https://api.github.com/repos/{repo_name}/contents/src"
        response = requests.get(api_url, timeout=30)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        files = response.json()
        js_files = []
        for file in files:
            if file["type"] == "file" and file["name"].endswith(".js"):
                file_info = {
                    "filename": file["name"],
                    "url": file["download_url"],
                }
                file_content_response = requests.get(file["download_url"], timeout=30)
                file_content_response.raise_for_status()
                file_info["source"] = file_content_response.text
                js_files.append(file_info)
        return js_files
    except requests.exceptions.RequestException as e:
        print(f"Error fetching repository files: {e}")
        return []
    except IndexError:
        print("Invalid GitHub repository URL format.")
        return []
    except KeyError:
        print("Unexpected response format from the GitHub API.")
        return []
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return []
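
# Note on rate limits: unauthenticated GitHub API requests are capped at 60
# per hour. A minimal sketch of authenticating the calls above, assuming a
# GITHUB_TOKEN environment variable is set (and `import os` is added):
#
#   headers = {"Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}"}
#   response = requests.get(api_url, headers=headers, timeout=30)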
def write_to_csv(data, output_file=FILE):
    """
    Writes file information to a CSV file.

    Args:
        data (list): A list of dictionaries, each containing file information.
        output_file (str): The name of the output CSV file.
    """
    try:
        with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
            fieldnames = ["filename", "source", "url"]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for file_info in data:
                writer.writerow(file_info)
        print(f"File information written to {output_file}")
    except Exception as e:
        print(f"Error writing to CSV: {e}")
def create_csv():
    """
    Main function to parse arguments and process the repository.
    """
    parser = argparse.ArgumentParser(
        description="Fetch JavaScript files from a GitHub repository's src directory."
    )
    parser.add_argument(
        "repo_url",
        nargs="?",
        default="https://github.com/KilledByAPixel/LittleJS",
        help="The GitHub repository URL.",
    )
    args = parser.parse_args()

    files = get_github_repo_files(args.repo_url)
    if files:
        write_to_csv(files)

    rows = []
    # Load the CSV file, skipping the header row.
    with open(FILE, mode="r", encoding="utf-8") as file:
        csv_reader = csv.reader(file)
        for idx, row in enumerate(csv_reader):
            if idx == 0:
                continue
            rows.append(row)

    # Convert the rows to Document objects so the LlamaIndex framework can
    # process them; each row is (filename, source, url).
    documents = [
        Document(text=row[1], metadata={"title": row[0], "url": row[2]})
        for row in rows
    ]
    print(len(documents))
    print(documents[0].metadata)
    print(documents[0])
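
# A minimal sketch of a possible next step, assuming an embedding model is
# configured (LlamaIndex defaults to OpenAI embeddings, which require
# OPENAI_API_KEY): build a vector index over `documents` at the end of
# create_csv() and query it.
#
#   from llama_index.core import VectorStoreIndex
#   index = VectorStoreIndex.from_documents(documents)
#   print(index.as_query_engine().query("How is the game loop started?"))

# Run the pipeline when the file is executed as a script.
if __name__ == "__main__":
    create_csv()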