Spaces:
Sleeping
Sleeping
| import requests | |
| import os | |
| from typing import Iterable | |
# GitHub organization whose repositories are scanned, and the REST API
# endpoint that lists them (built from org_name so the two cannot drift apart).
org_name = "oceanhackweek"
url = f"https://api.github.com/orgs/{org_name}/repos"

# Optional personal access token, read from the 'git_token' environment variable.
access_token = os.getenv('git_token')
headers = {"Accept": "application/vnd.github.v3+json"}
if access_token:
    # Only authenticate when a token is actually configured: with the variable
    # unset, the header would otherwise carry the literal credential "token None".
    headers["Authorization"] = f"token {access_token}"

# Create the directory that stores the downloaded README files.
# exist_ok=True avoids the check-then-create race of a separate exists() test.
os.makedirs('readmes_proj', exist_ok=True)

# Mapping of saved README filename -> repository HTML URL.
repo_links = {}
def download_readme(repo_name, repo_html_url, branches=("main", "master")):
    """Download a repository's README.md into the 'readmes_proj' directory.

    The README is fetched from raw.githubusercontent.com, trying each branch
    in ``branches`` in order and stopping at the first one that succeeds.
    On success the file is written as '<repo_name>_README.md' and the
    filename -> ``repo_html_url`` pair is recorded in the module-level
    ``repo_links`` dictionary. On failure a message is printed and nothing
    is written or recorded.

    Args:
        repo_name: Name of the repository inside the ``org_name`` organization.
        repo_html_url: Repository page URL stored alongside the saved file.
        branches: Branch names to try, in order. The default keeps 'main'
            first and falls back to 'master'.
    """
    file_name = f"{repo_name}_README.md"
    file_path = os.path.join('readmes_proj', file_name)

    last_error = None
    for branch in branches:
        # Construct the URL for the README file on this branch
        readme_url = f"https://raw.githubusercontent.com/{org_name}/{repo_name}/{branch}/README.md"
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(readme_url, timeout=30)
            response.raise_for_status()  # Raise an error for bad responses
        except requests.exceptions.RequestException as e:
            # Covers HTTP errors as well as connection failures and timeouts;
            # remember the error and try the next candidate branch.
            last_error = e
            continue

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(response.text)
        # Save the repo link in the dictionary
        repo_links[file_name] = repo_html_url
        print(f"Downloaded: {repo_name}")
        return

    print(f"Failed to download {repo_name}: {last_error}")
def get_repositories(url):
    """Return every repository listed at ``url``, following pagination.

    Each page is requested with the module-level ``headers`` and its JSON
    payload is appended to the result. The loop continues with the 'next'
    link of each response until there is none.

    Args:
        url: URL of the first page to request. A falsy value yields an
            empty list without making any request.

    Returns:
        A list with the decoded JSON items of all pages, in page order.

    Raises:
        requests.exceptions.HTTPError: If any page returns an error status.
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    repos = []
    while url:
        # A timeout prevents a stalled connection from blocking forever.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        repos.extend(response.json())
        # Check if there is a next page (response.links is the parsed Link header)
        url = response.links.get('next', {}).get('url')
    return repos
# List the organization's repositories and download the README of every
# repository whose name contains "proj".
repos = get_repositories(url)
for repo in repos:
    repo_name, repo_html_url = repo["name"], repo["html_url"]
    if "proj" not in repo_name:
        continue
    download_readme(repo_name, repo_html_url)
def load_md_to_langchain_document(readme_dict, filename):
    """Load a saved README and wrap it in a ``Document``.

    The file is read from './readmes_proj/<filename>', passed through a
    regex substitution, and returned as a ``Document`` whose ``source``
    metadata is the value stored for ``filename`` in ``readme_dict``.

    Args:
        readme_dict: Mapping of README filename to its source link.
        filename: Name of the README file inside './readmes_proj'.

    Returns:
        The ``Document`` built from the file content.

    Raises:
        KeyError: If ``filename`` is not a key of ``readme_dict``.
        OSError: If the file cannot be opened.
    """
    # `re` is not among the imports visible at the top of this file, so it is
    # imported here to keep the function from failing with a NameError.
    import re

    # Load the markdown content from a file
    with open(f'./readmes_proj/{filename}', 'r', encoding='utf-8') as file:
        markdown_content = file.read()

    # Append a space after each match of the link pattern.
    # NOTE(review): the pattern ends in an escaped backslash, so it only matches
    # "[text](..." up to a literal backslash character. If the intent was to add
    # a space after every complete Markdown link, the tail should probably be an
    # escaped ")" instead - confirm before changing.
    corrected_content = re.sub(r'(\[.*?\]\(.*?\\)', r'\1 ', markdown_content)

    # Create a LangChain Document
    # NOTE(review): `Document` is not imported in the visible part of this file;
    # presumably LangChain's Document class - confirm and add the import.
    langchain_document = Document(
        page_content=corrected_content,
        metadata={"source": readme_dict[filename]}
    )
    return langchain_document
# Example usage: build one document per README that was downloaded.
documents = [
    load_md_to_langchain_document(repo_links, filename)
    for filename in repo_links
]
def save_docs_to_jsonl(array: 'Iterable[Document]', file_path: str) -> None:
    """Save documents to ``file_path`` in JSON Lines format.

    Each item is serialized with its own ``json()`` method and written on a
    separate line. An existing file at ``file_path`` is overwritten; an
    empty iterable produces an empty file.

    Args:
        array: Documents to save; each must provide a ``json()`` method
            returning a string.
        file_path: Destination path of the output file.
    """
    # The annotation above is quoted so that `Document` is not evaluated when
    # the function is defined. The explicit encoding keeps the output identical
    # across platforms instead of depending on the locale default.
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        for doc in array:
            jsonl_file.write(doc.json() + '\n')
# Persist the collected documents, one JSON object per line.
save_docs_to_jsonl(array=documents, file_path='project_readmes.json')