Spaces:
Runtime error
Runtime error
| import os | |
| from dotenv import load_dotenv | |
| from langchain.document_loaders import GithubFileLoader | |
| # from langchain.embeddings import HuggingFaceEmbeddings | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_text_splitters import CharacterTextSplitter | |
# Pull settings from a local .env file into the process environment.
load_dotenv()

# GitHub token read from the environment; None when the variable is unset.
GITHUB_ACCESS_TOKEN = os.environ.get("GITHUB_ACCESS_TOKEN")

# Owner and name of the repository to work with, plus the GitHub web base URL.
GITHUB_BASE_URL = "https://github.com/"
USER = "heaversm"
REPO = "gdrive-docker"
def get_similar_files(query, db, embeddings, k=None):
    """Return the entries in ``db`` most similar to ``query``, with scores.

    Args:
        query: Text to search for; it is handed to the store unchanged.
        db: Object exposing ``similarity_search_with_score``.
        embeddings: Unused. Kept so existing callers keep working.
        k: Optional number of results to request. When ``None`` (the
            default) no ``k`` is passed and the store's own default applies.

    Returns:
        Whatever ``db.similarity_search_with_score`` returns.
    """
    if k is None:
        return db.similarity_search_with_score(query)
    return db.similarity_search_with_score(query, k=k)
def get_hugging_face_model(model_name="mchochlov/codebert-base-cd-ft"):
    """Build a Hugging Face embeddings object for the given model.

    Args:
        model_name: Hugging Face model identifier to load. Defaults to the
            model this function previously hard-coded, so calling it with
            no arguments behaves as before.

    Returns:
        A ``HuggingFaceEmbeddings`` instance constructed with ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)
# Load every Python and TypeScript file from the USER/REPO repository.
loader = GithubFileLoader(
    repo=f"{USER}/{REPO}",
    access_token=GITHUB_ACCESS_TOKEN,
    github_api_url="https://api.github.com",
    file_filter=lambda file_path: file_path.endswith((".py", ".ts")),
)
documents = loader.load()

# Split the loaded files into chunks (chunk_size=1000, no overlap).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

# NOTE: despite its name, `embedding_vector` holds the embeddings model
# object returned by get_hugging_face_model(), not a vector.
embedding_vector = get_hugging_face_model()
db = FAISS.from_documents(docs, embedding_vector)
model_name = "mchochlov/codebert-base-cd-ft"

# Sample query: a code snippet to look up in the indexed repository.
query = """
def create_app():
    app = connexion.FlaskApp(__name__, specification_dir="../.openapi")
    app.add_api(
        API_VERSION, resolver=connexion.resolver.RelativeResolver("provider.app")
    )
"""
results_with_scores = get_similar_files(query, db, embedding_vector)
print("retrieved!!!")
print(f"Number of results: {len(results_with_scores)}")

# score is a distance score, the lower the better
for doc, score in results_with_scores:
    print(f"Metadata: {doc.metadata}, Score: {score}")

# Guard against an empty result set: indexing [0] unconditionally would
# raise IndexError and hide the real situation (nothing matched).
if results_with_scores:
    top_doc, top_file_score = results_with_scores[0]
    top_file_path = top_doc.metadata['path']
    top_file_content = top_doc.page_content
    # NOTE(review): the link hard-codes the `main` branch; this assumes the
    # loader read the files from that same branch -- confirm.
    top_file_link = f"{GITHUB_BASE_URL}{USER}/{REPO}/blob/main/{top_file_path}"
    print(f"Top file link: {top_file_link}")
else:
    print("No similar files found.")