# AI_project / fetch_repo.py
# Author: Eoin McGrath — initial commit (257dcc1)
import argparse
import os
import requests
import csv
from urllib.parse import urljoin
from llama_index.core import Document
FILE="repo_files.csv"  # Default CSV path shared by write_to_csv (output) and create_csv (input)
def get_github_repo_files(repo_url):
    """
    Fetch the JavaScript files in the top-level 'src' directory of a GitHub repository.

    Args:
        repo_url (str): The URL of the GitHub repository
            (e.g. "https://github.com/owner/repo").

    Returns:
        list: A list of dicts, each with keys "filename", "url" and "source"
            (the raw file contents). Returns an empty list on any error.
    """
    try:
        # "owner/repo" part of the URL; raises IndexError when the URL
        # does not contain a "github.com/" segment.
        repo_name = repo_url.split("github.com/")[1].rstrip("/")
        api_url = f"https://api.github.com/repos/{repo_name}/contents/src"
        # An explicit timeout keeps the script from hanging forever on a
        # stalled connection (requests has no default timeout).
        response = requests.get(api_url, timeout=30)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        files = response.json()
        js_files = []
        for file in files:
            # Only plain files ending in .js; sub-directories are skipped,
            # so this does NOT recurse below src/.
            if file["type"] == "file" and file["name"].endswith(".js"):
                file_info = {
                    "filename": file["name"],
                    "url": file["download_url"],
                }
                file_content_response = requests.get(file["download_url"], timeout=30)
                file_content_response.raise_for_status()
                file_info["source"] = file_content_response.text
                js_files.append(file_info)
        return js_files
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and HTTP 4xx/5xx (including a
        # missing src directory, which GitHub reports as 404).
        print(f"Error fetching repository files: {e}")
        return []
    except IndexError:
        print("Invalid GitHub repository URL format.")
        return []
    except KeyError:
        # Unexpected JSON shape (e.g. an API error object instead of a file list).
        print("src directory not found in the repository")
        return []
    except Exception as e:
        # Last-resort boundary so a single bad repo never crashes the caller.
        print(f"An unexpected error occurred: {e}")
        return []
def write_to_csv(data, output_file=FILE):
    """
    Persist a list of file-info dictionaries to a CSV file.

    Args:
        data (list): Dictionaries with "filename", "source" and "url" keys.
        output_file (str): Destination CSV path (defaults to FILE).
    """
    columns = ["filename", "source", "url"]
    try:
        with open(output_file, "w", newline="", encoding="utf-8") as handle:
            writer = csv.DictWriter(handle, fieldnames=columns)
            writer.writeheader()
            writer.writerows(data)
        print(f"File information written to {output_file}")
    except Exception as e:
        print(f"Error writing to CSV: {e}")
def create_csv():
    """
    Main entry point: fetch the JS files from a repository's src directory,
    write them to a CSV file and load them back as LlamaIndex Documents.
    """
    parser = argparse.ArgumentParser(description="Fetch JavaScript files from a GitHub repository's src directory.")
    # The argument is optional (nargs="?") with the original hard-coded repo
    # as default, so existing no-argument invocations keep working while the
    # CLI argument is now actually honored (the original never called
    # parse_args and ignored the URL entirely).
    parser.add_argument(
        "repo_url",
        nargs="?",
        default="https://github.com/KilledByAPixel/LittleJS",
        help="The GitHub repository URL.",
    )
    args = parser.parse_args()
    files = get_github_repo_files(args.repo_url)
    if files:
        write_to_csv(files)
    print('READY?')
    # Re-read the CSV; DictReader consumes the header row automatically and
    # keys each row by column name instead of positional index.
    with open(FILE, mode="r", encoding="utf-8") as file:
        rows = list(csv.DictReader(file))
    # Convert the rows to Document objects so the LlamaIndex framework can process them.
    documents = [
        Document(text=row["source"], metadata={"title": row["filename"], "url": row["url"]})
        for row in rows
    ]
    print(len(documents))
    # Guard: an empty CSV previously crashed with IndexError on documents[0].
    if documents:
        print(documents[0].metadata)
        print(documents[0])