Spaces:

boryasbora
/

chatbot_ohw_projects

Sleeping

App Files Files Community

chatbot_ohw_projects / scrape_github.py

boryasbora

Create scrape_github.py

ee03791 verified over 1 year ago

raw

history blame contribute delete

2.92 kB

	import os
	import re
	from typing import Iterable

	import requests

	# GitHub organization whose repositories are scraped, and the REST API
	# endpoint that lists them (built from org_name so the two cannot drift).
	org_name = "oceanhackweek"
	url = f"https://api.github.com/orgs/{org_name}/repos"

	# Optional personal access token, read from the `git_token` environment
	# variable.
	access_token = os.getenv('git_token')
	headers = {"Accept": "application/vnd.github.v3+json"}
	if access_token:
	    # Only authenticate when a token is actually configured; otherwise the
	    # header would carry the literal text "token None", which is not a
	    # valid credential.
	    headers["Authorization"] = f"token {access_token}"

	# Directory that receives the downloaded README files. exist_ok avoids the
	# check-then-create race and makes re-running the script safe.
	os.makedirs('readmes_proj', exist_ok=True)

	# Maps each saved README filename to the HTML URL of its repository.
	repo_links = {}

	def download_readme(repo_name, repo_html_url, branches=("main", "master"), timeout=30):
	    """Download a repository's README.md into the ``readmes_proj`` directory.

	    The file is fetched from raw.githubusercontent.com, trying each branch in
	    ``branches`` in order and keeping the first response that succeeds. On
	    success the text is saved as ``readmes_proj/<repo_name>_README.md``, the
	    module-level ``repo_links`` mapping records ``repo_html_url`` under that
	    filename, and a confirmation is printed. If no branch yields a README, a
	    failure message with the last error is printed and nothing is written.

	    Args:
	        repo_name: Repository name inside the ``org_name`` organization.
	        repo_html_url: Repository URL stored in ``repo_links``.
	        branches: Branch names to try, in order. The default also covers
	            repositories that still use ``master`` instead of ``main``.
	        timeout: Seconds to wait for each HTTP request before giving up.
	    """
	    response = None
	    last_error = None
	    for branch in branches:
	        readme_url = f"https://raw.githubusercontent.com/{org_name}/{repo_name}/{branch}/README.md"
	        try:
	            candidate = requests.get(readme_url, timeout=timeout)
	            candidate.raise_for_status()  # Raise an error for bad responses
	        except requests.exceptions.RequestException as e:
	            # Covers HTTP errors as well as connection problems and
	            # timeouts, so one unreachable README cannot abort the scrape.
	            last_error = e
	            continue
	        response = candidate
	        break

	    if response is None:
	        print(f"Failed to download {repo_name}: {last_error}")
	        return

	    file_name = f"{repo_name}_README.md"
	    file_path = os.path.join('readmes_proj', file_name)
	    with open(file_path, 'w', encoding='utf-8') as file:
	        file.write(response.text)

	    # Remember which repository this README came from.
	    repo_links[file_name] = repo_html_url
	    print(f"Downloaded: {repo_name}")

	def get_repositories(url, timeout=30):
	    """Collect every item from a paginated GitHub API listing.

	    Starting at ``url``, each page is requested with the module-level
	    ``headers`` and its decoded JSON items are appended to the result. The
	    loop follows the ``next`` link of each response until none is present.

	    Args:
	        url: URL of the first page of the listing.
	        timeout: Seconds to wait for each HTTP request; without it a stalled
	            connection would block the script indefinitely.

	    Returns:
	        A list with the JSON items of all pages, in the order received.

	    Raises:
	        requests.exceptions.HTTPError: If any page returns an error status.
	    """
	    repos = []
	    while url:
	        response = requests.get(url, headers=headers, timeout=timeout)
	        response.raise_for_status()
	        repos.extend(response.json())
	        # response.links holds the parsed Link header; a missing "next"
	        # entry yields None and ends the loop.
	        url = response.links.get('next', {}).get('url')
	    return repos
	# Fetch the organization's repositories and download the README of every
	# repository whose name contains "proj".
	repos = get_repositories(url)
	project_repos = [entry for entry in repos if "proj" in entry["name"]]
	for entry in project_repos:
	    download_readme(entry["name"], entry["html_url"])

	def load_md_to_langchain_document(readme_dict, filename):
	    """Read a saved README and wrap its text in a ``Document``.

	    Args:
	        readme_dict: Mapping whose value for ``filename`` is stored as the
	            document's ``source`` metadata.
	        filename: Name of the file inside ``./readmes_proj`` to read.

	    Returns:
	        A ``Document`` whose ``page_content`` is the file's text with a
	        space appended after every inline markdown link.

	    Raises:
	        OSError: If the file cannot be opened.
	        KeyError: If ``filename`` is not a key of ``readme_dict``.
	    """
	    # NOTE(review): `Document` is not imported in the visible part of this
	    # file; presumably it is LangChain's Document class - confirm and
	    # import it at the top of the file.
	    with open(f'./readmes_proj/{filename}', 'r', encoding='utf-8') as file:
	        markdown_content = file.read()

	    # Append a space after each inline link of the form [text](target).
	    # The non-greedy quantifiers keep every match confined to one link.
	    # The earlier pattern demanded a literal backslash after the opening
	    # parenthesis, so it practically never matched.
	    corrected_content = re.sub(r'(\[.*?\]\(.*?\))', r'\1 ', markdown_content)

	    # Create a LangChain Document
	    return Document(
	        page_content=corrected_content,
	        metadata={"source": readme_dict[filename]},
	    )

	# Build one document per README that was downloaded successfully.
	documents = [
	    load_md_to_langchain_document(repo_links, readme_name)
	    for readme_name in repo_links
	]

	def save_docs_to_jsonl(array: "Iterable[Document]", file_path: str) -> None:
	    """Write documents to ``file_path``, one JSON string per line.

	    Each item is serialized by calling its own ``json()`` method and is
	    followed by a newline. An existing file at ``file_path`` is overwritten;
	    an empty iterable produces an empty file.

	    Args:
	        array: Documents to serialize; every item must provide ``json()``.
	        file_path: Path of the output file.
	    """
	    # The annotation is a string so that defining this function does not
	    # require the name `Document` to be resolvable at definition time.
	    # UTF-8 is explicit so the output does not depend on the platform's
	    # default encoding.
	    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
	        jsonl_file.writelines(doc.json() + '\n' for doc in array)
	# Persist the collected documents to disk.
	save_docs_to_jsonl(array=documents, file_path='project_readmes.json')