"""Streamlit app for asking questions about a GitHub repository.

The app downloads the text files of a repository through the GitHub API,
indexes them in a Chroma vector store using HuggingFace embeddings, retrieves
the chunk most similar to the user's question and asks an OpenAI chat model to
answer from that chunk.
"""

import hashlib
import os
from collections import deque

import streamlit as st
from dotenv import load_dotenv
from github import Github
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI

# Load environment variables
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# Directory in which Chroma keeps the embedded repository chunks.
PERSIST_DIRECTORY = "github_repo_embeddings"


# Streamlit reruns the whole script on every widget interaction, so the
# download is cached. The token is deliberately part of the cache key: the
# cache is shared between sessions and a private repository must only be
# served to a caller presenting the same token. Exceptions propagate out of
# the cached function and are therefore not cached.
@st.cache_data(show_spinner="Fetching repository contents...", ttl=3600, max_entries=8)
def _download_repo_text(repo_name, github_token):
    """Download and concatenate every UTF-8 text file of a repository.

    Args:
        repo_name: Repository in ``owner/repo`` form.
        github_token: GitHub access token used for the API calls.

    Returns:
        One string made of a ``File: <path>`` header followed by the file
        text for each readable file, in breadth-first directory order.

    Raises:
        Exception: Whatever the GitHub client raises for the repository
            lookup or a directory listing.
    """
    repo = Github(github_token).get_repo(repo_name)
    pending = deque(repo.get_contents(""))
    sections = []
    while pending:
        entry = pending.popleft()
        if entry.type == "dir":
            pending.extend(repo.get_contents(entry.path))
            continue
        if entry.type != "file":
            # Submodules and symlinks carry no file body to decode.
            continue
        try:
            raw = repo.get_contents(entry.path).decoded_content
            text = raw.decode("utf-8")
        except (UnicodeDecodeError, AssertionError):
            # UnicodeDecodeError: binary / non-text file.
            # AssertionError: presumably raised by PyGithub when the API
            # returns no base64 body (e.g. files over 1 MB) - skip the file
            # instead of failing the whole repository.
            continue
        sections.append(f"\n\nFile: {entry.path}\n{text}")
    return "".join(sections)


# Function to fetch repository data from GitHub
def fetch_github_repo_data(repo_name, github_token):
    """Fetch all text content from a GitHub repository.

    Args:
        repo_name: Repository in ``owner/repo`` form.
        github_token: GitHub access token.

    Returns:
        The concatenated repository text, or ``None`` if fetching failed
        (the error is shown in the Streamlit UI).
    """
    try:
        return _download_repo_text(repo_name, github_token)
    except Exception as e:
        st.error(f"Error fetching GitHub repository data: {e}")
        return None


# Function to generate a response using OpenAI
def generate_response(context, question):
    """Generate a response using OpenAI.

    Args:
        context: Repository text the answer should be based on.
        question: The user's question.

    Returns:
        The stripped answer text, or ``None`` if the API key is missing, the
        request failed, or the model returned no text.
    """
    if not openai_api_key:
        st.error("OPENAI_API_KEY is not set. Add it to your environment or .env file.")
        return None
    try:
        client = OpenAI(api_key=openai_api_key)
        messages = [
            {
                "role": "system",
                "content": "You are an assistant that answers questions based on repository content.",
            },
            {
                "role": "user",
                "content": f"Context: {context}\n\nQuestion: {question}\n\nAnswer:",
            },
        ]
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            max_tokens=150,
        )
        answer = response.choices[0].message.content
        # The content field may be None; avoid calling .strip() on it.
        return answer.strip() if answer else None
    except Exception as e:
        st.error(f"Error generating response: {e}")
        return None


# Loading the embedding model and embedding the whole repository is slow, so
# the resulting store is built once per distinct repository text and reused
# across reruns and questions.
@st.cache_resource(show_spinner="Indexing repository...", max_entries=4)
def _build_vector_store(repo_data):
    """Embed the repository text into a Chroma collection of its own.

    Args:
        repo_data: Concatenated repository text.

    Returns:
        A Chroma vector store holding the chunks of ``repo_data``.
    """
    digest = hashlib.sha256(repo_data.encode("utf-8")).hexdigest()

    # Create embeddings
    embeddings = HuggingFaceEmbeddings()

    # Split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=20,
        length_function=len,
    )
    chunks = text_splitter.create_documents([repo_data])

    # Store chunks in ChromaDB. The collection name is derived from the
    # content so different repositories never share search results, and the
    # ids are deterministic so indexing the same text again is expected to
    # overwrite existing entries rather than duplicate them (assumes Chroma
    # upserts by id - confirm for the installed version).
    vectordb = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        ids=[f"{digest}-{index}" for index in range(len(chunks))],
        collection_name=f"repo-{digest[:32]}",
        persist_directory=PERSIST_DIRECTORY,
    )
    # NOTE(review): likely redundant on Chroma >= 0.4, which persists
    # automatically; kept for older versions - confirm before removing.
    vectordb.persist()
    return vectordb


# Function to perform RAG using OpenAI and Chroma
def perform_rag(repo_data, question):
    """Perform retrieval-augmented generation using ChromaDB and OpenAI.

    Args:
        repo_data: Concatenated repository text.
        question: The user's question.

    Returns:
        The generated answer, or ``None`` if the data is empty, nothing was
        retrieved, or an error occurred (reported in the Streamlit UI).
    """
    try:
        if not repo_data:
            st.warning("Repository data is empty.")
            return None

        vectordb = _build_vector_store(repo_data)

        # Perform retrieval using Chroma
        docs = vectordb.similarity_search(question)
        if not docs:
            st.warning("No relevant documents found.")
            return None

        # Only the best-matching chunk is passed to the model as context.
        context = docs[0].page_content
        return generate_response(context, question)
    except Exception as e:
        st.error(f"Error performing RAG: {e}")
        return None


# Streamlit application
def main():
    """Render the UI: collect token and repository, then answer questions."""
    st.title("Chat with GitHub Repository")
    st.caption("This app allows you to interact with a GitHub repository using OpenAI and ChromaDB.")

    # Get user inputs
    github_token = st.text_input("Enter your GitHub Token", type="password")
    # Surrounding whitespace from copy/paste would make the lookup fail.
    git_repo = st.text_input("Enter the GitHub Repo (owner/repo)").strip()

    if github_token and git_repo:
        repo_data = fetch_github_repo_data(git_repo, github_token)
        if repo_data:
            st.success(f"Successfully added {git_repo} to the knowledge base!")
            question = st.text_input("Ask any question about the repository")
            if question:
                answer = perform_rag(repo_data, question)
                if answer:
                    st.subheader("Generated Answer:")
                    st.write(answer)
        else:
            st.error("Failed to fetch repository data. Ensure the repository name and token are correct.")


if __name__ == "__main__":
    main()