File size: 3,374 Bytes
257dcc1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import argparse
import os
import requests
import csv
from urllib.parse import urljoin

from llama_index.core import Document

# Path of the CSV file: the default destination of write_to_csv() and the
# file the module-level loader at the bottom of this script reads back.
FILE="repo_files.csv"

def _parse_repo_name(repo_url):
    """
    Extracts the "owner/repo" slug from a GitHub repository URL.

    Args:
        repo_url (str): The URL of the GitHub repository.

    Returns:
        str or None: The "owner/repo" slug, or None when the URL does not contain one.
    """
    if not isinstance(repo_url, str):
        return None

    _, marker, path = repo_url.partition("github.com/")
    if not marker:
        return None

    # Drop any query string / fragment, then keep only the first two path
    # segments so URLs such as ".../owner/repo/tree/main" still resolve.
    path = path.split("?", 1)[0].split("#", 1)[0]
    segments = [segment for segment in path.split("/") if segment]
    if len(segments) < 2:
        return None

    owner, repo = segments[:2]
    # Clone URLs end in ".git", which the contents API does not accept.
    if repo.endswith(".git"):
        repo = repo[:-len(".git")]
    if not repo:
        return None
    return f"{owner}/{repo}"


def get_github_repo_files(repo_url, timeout=10):
    """
    Fetches all JavaScript files in the 'src' directory of a GitHub repository.

    Only the top level of 'src' is listed; sub-directories are not traversed.
    Any failure is reported with print() and results in an empty list, so a
    single failed download discards the files fetched before it.

    Args:
        repo_url (str): The URL of the GitHub repository.
        timeout (float): Seconds to wait for each HTTP request before giving up.

    Returns:
        list: A list of dictionaries, each containing file information (filename, source, url).
    """
    # Validate the URL before touching the network.
    repo_name = _parse_repo_name(repo_url)
    if repo_name is None:
        print("Invalid GitHub repository URL format.")
        return []

    api_url = f"https://api.github.com/repos/{repo_name}/contents/src"

    try:
        # Without an explicit timeout, requests waits indefinitely on a stalled connection.
        response = requests.get(api_url, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        entries = response.json()

        # The contents API returns a list for a directory and a single object
        # for a file, so anything but a list means 'src' is not a directory.
        if not isinstance(entries, list):
            print("src directory not found in the repository")
            return []

        js_files = []
        for entry in entries:
            if entry["type"] == "file" and entry["name"].endswith(".js"):
                download_url = entry["download_url"]
                content_response = requests.get(download_url, timeout=timeout)
                content_response.raise_for_status()
                js_files.append(
                    {
                        "filename": entry["name"],
                        "url": download_url,
                        "source": content_response.text,
                    }
                )

        return js_files

    except requests.exceptions.RequestException as e:
        print(f"Error fetching repository files: {e}")
        return []
    except KeyError:
        # Raised when a listing entry lacks one of the expected keys.
        print("src directory not found in the repository")
        return []
    except Exception as e:
        print(f"An unexpected error occured: {e}")
        return []

def write_to_csv(data, output_file=FILE):
    """
    Dumps the collected file records into a CSV file with a header row.

    The file is created or overwritten. Any error raised while opening or
    writing is caught and reported with print() rather than propagated.

    Args:
        data (list): Dictionaries describing files, keyed by "filename", "source" and "url".
        output_file (str): Path of the CSV file to write.
    """
    columns = ["filename", "source", "url"]
    try:
        with open(output_file, "w", newline="", encoding="utf-8") as handle:
            csv_out = csv.DictWriter(handle, fieldnames=columns)
            csv_out.writeheader()
            # One CSV record per dictionary, in the order given.
            csv_out.writerows(data)
        print(f"File information written to {output_file}")
    except Exception as err:
        print(f"Error writing to CSV: {err}")

def create_csv(argv=None):
    """
    Fetches a repository's JavaScript sources and stores them in the CSV file.

    Args:
        argv (list or None): Command-line style arguments to parse, e.g.
            ``sys.argv[1:]``. The optional positional argument is the GitHub
            repository URL. When ``argv`` is None (the default) nothing is
            parsed and the LittleJS repository is used, so existing
            ``create_csv()`` calls behave as before and never read ``sys.argv``.
    """
    repo_url = "https://github.com/KilledByAPixel/LittleJS"

    if argv is not None:
        parser = argparse.ArgumentParser(description="Fetch JavaScript files from a GitHub repository's src directory.")
        # nargs="?" makes the URL optional so an empty argv falls back to the default.
        parser.add_argument("repo_url", nargs="?", default=repo_url, help="The GitHub repository URL.")
        repo_url = parser.parse_args(argv).repo_url

    files = get_github_repo_files(repo_url)
    # An empty result means the fetch failed or found nothing; leave any existing CSV untouched.
    if files:
        write_to_csv(files)


print('READY?')

# Each row stores a whole JavaScript file in its "source" column, which can
# exceed the csv module's default 131072-character field limit; raise the
# limit (to a value that also fits a 32-bit C long) before reading.
csv.field_size_limit(2**31 - 1)

rows = []
# Load the CSV file written by write_to_csv(). open() raises FileNotFoundError
# if the file has not been generated yet (create_csv() produces it).
# newline="" lets the csv module handle line endings embedded in quoted fields.
with open(FILE, mode="r", newline="", encoding="utf-8") as file:
    csv_reader = csv.reader(file)
    next(csv_reader, None)  # Skip header row (no-op on an empty file)
    # Columns are filename, source, url; skip blank or truncated rows that
    # cannot supply all three.
    rows = [row for row in csv_reader if len(row) >= 3]

# Convert the rows to Document objects so the LlamaIndex framework can process them.
documents = [Document(text=row[1], metadata={"title": row[0], "url": row[2]}) for row in rows]
print(len(documents))
# Only show a sample when there is one; a header-only CSV yields no documents.
if documents:
    print(documents[0].metadata)
    print(documents[0])