JumaRubea committed on
Commit
ac8c947
·
verified ·
1 Parent(s): 9d3e1f5

Upload 4 files

Browse files
Files changed (4) hide show
  1. src/app.py +59 -0
  2. src/me.txt +32 -0
  3. src/rag_components.py +51 -0
  4. src/requirements.txt +4 -0
src/app.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from rag_components import load_documents, split_documents, create_embeddings, setup_vector_store, create_qa_chain
3
+ import os
4
+
5
# Set the page title passed to Streamlit's page config, then render the
# on-page heading.
st.set_page_config(
    page_title="Document Chatbot",
)
st.title("Chat with your Documents")
7
+
8
@st.cache_resource
def _build_rag_components(file_path):
    """Build the QA chain and retriever for an existing document file.

    Only this successful build path is cached, so a missing-file failure in
    ``initialize_rag_components`` is re-evaluated on every rerun instead of
    being remembered for the lifetime of the process.
    """
    documents = load_documents(file_path)
    docs = split_documents(documents)
    embeddings = create_embeddings()
    retriever = setup_vector_store(docs, embeddings)
    qa_chain = create_qa_chain(retriever)
    return qa_chain, retriever


def initialize_rag_components(file_path="me.txt"):
    """Initialize the RAG components, reusing a cached build when available.

    Args:
        file_path: Path of the text document to index. A relative path is
            tried against the current working directory first and then
            against the directory containing this script, so the app also
            works when it is launched from a different directory.

    Returns:
        ``(qa_chain, retriever)`` on success, or ``(None, None)`` (after
        showing an error in the UI) when the document cannot be found.
    """
    resolved_path = file_path
    if not os.path.exists(resolved_path) and not os.path.isabs(file_path):
        # The default "me.txt" sits next to this script, which is not
        # necessarily the working directory the app was started from.
        script_dir = os.path.dirname(os.path.abspath(__file__))
        candidate = os.path.join(script_dir, file_path)
        if os.path.exists(candidate):
            resolved_path = candidate

    if not os.path.exists(resolved_path):
        st.error(f"Error: Document file not found at {file_path}")
        return None, None

    return _build_rag_components(resolved_path)
21
+
22
# Build the QA chain and retriever (or get the failure sentinel (None, None)).
qa_chain, retriever = initialize_rag_components()

if qa_chain is not None:
    # Initialize chat history
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Display chat messages from history on app rerun
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # React to user input
    if prompt := st.chat_input("Ask me a question about the document"):
        # Record and display the user message
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        # Display assistant response in chat message container
        with st.chat_message("assistant"):
            message_placeholder = st.empty()
            full_response = ""
            try:
                # Chunks are expected to be dicts; only those carrying a
                # 'result' key contribute text to the answer.
                for chunk in qa_chain.stream(prompt):
                    if "result" in chunk:
                        full_response += chunk["result"]
                    # Trailing block acts as a typing cursor while streaming.
                    message_placeholder.markdown(full_response + "▌")
                message_placeholder.markdown(full_response)
            except Exception as e:
                st.error(f"An error occurred: {e}")
                full_response = "Sorry, I could not process your request."
                # Replace any partial text (and the cursor) so the bubble
                # shows exactly what is stored in the chat history below.
                message_placeholder.markdown(full_response)

        # Add assistant response to chat history
        st.session_state.messages.append({"role": "assistant", "content": full_response})
else:
    st.warning("RAG components could not be initialized. Please check the document file path.")
src/me.txt ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # About Me
2
+
3
+ My name is Juma Rubea. I am passionate about artificial intelligence, software development, and data science.
4
+ I currently live in Dar es Salaam, Tanzania, and work as a Junior Data Scientist.
5
+
6
+ # Skills and Expertise
7
+ - Programming Languages: Python, AI, ML, Data Science
8
+ - AI/ML Tools: LangChain, Hugging Face Transformers, PyTorch, TensorFlow
9
+ - Databases: PostgreSQL, MongoDB, Chroma, FAISS
10
+ - Cloud & DevOps: AWS, Docker, Kubernetes
11
+
12
+ # Education
13
+ I studied [Your Degree, e.g., Computer Science] at [Your University].
14
+ I have taken specialized courses in machine learning, natural language processing, and cloud computing.
15
+
16
+ # Professional Experience
17
+ - Data Science at SkyConnect (2 years)
18
+ - Worked on computer vision
19
+ - Built Sevia using Mask R-CNN, DeepLabv3, etc.
20
+
21
+ # Projects
22
+ - Chatbot Development: Created a chatbot using LangChain and Hugging Face.
23
+ - RAG Systems: Implemented retrieval-augmented generation pipelines with TinyLlama.
24
+ - Data Engineering: Built data pipelines for structured and unstructured data.
25
+
26
+ # Hobbies & Interests
27
+ In my free time, I enjoy reading tech blogs, playing chess, traveling, contributing to open source, and swimming.
28
+
29
+ # Contact
30
+ - Email: rubeajuma8@gmail.com
31
+ - GitHub: github.jumarubea.com
32
+ - LinkedIn: link.jumarubea.com
src/rag_components.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.vectorstores import Chroma
2
+ from langchain_huggingface import HuggingFaceEmbeddings
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain.document_loaders import TextLoader
5
+ from langchain_huggingface import HuggingFacePipeline
6
+ from langchain.chains import RetrievalQA
7
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
8
+
9
def load_documents(file_path: str):
    """Load the text file at ``file_path`` and return the loader's documents."""
    return TextLoader(file_path).load()
13
+
14
def split_documents(documents, chunk_size=500, chunk_overlap=50):
    """Split ``documents`` into chunks with a recursive character splitter.

    Args:
        documents: Documents to split.
        chunk_size: Value passed to the splitter as ``chunk_size``.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.

    Returns:
        The chunks produced by the splitter's ``split_documents``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(documents)
18
+
19
def create_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Return a ``HuggingFaceEmbeddings`` instance for ``model_name``."""
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    return embedding_model
22
+
23
def setup_vector_store(docs, embeddings, persist_directory="./chroma_db"):
    """Index ``docs`` in a persisted Chroma store and return a retriever.

    Args:
        docs: Document chunks to index; each must expose ``page_content``.
        embeddings: Embedding model used by the vector store.
        persist_directory: Directory in which Chroma keeps its data.

    Returns:
        The retriever obtained from the store's ``as_retriever()``.
    """
    import hashlib

    # Deterministic ids: because the store is persisted on disk, re-indexing
    # the same chunks on the next start would otherwise add them again under
    # fresh random ids and the retriever would begin returning duplicates.
    # The position is part of the hash so identical chunks within one batch
    # still get distinct ids.
    # NOTE(review): chunks from an older version of the document are not
    # removed by this; clear the directory if the source text changes a lot.
    ids = [
        hashlib.sha256(f"{position}:{doc.page_content}".encode("utf-8")).hexdigest()
        for position, doc in enumerate(docs)
    ]
    db = Chroma.from_documents(
        docs,
        embeddings,
        ids=ids,
        persist_directory=persist_directory,
    )
    return db.as_retriever()
27
+
28
def create_qa_chain(retriever, model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
    """Create a ``RetrievalQA`` chain backed by a local text-generation model.

    Args:
        retriever: Retriever that supplies context documents to the chain.
        model_name: Hugging Face model id used for both tokenizer and model.

    Returns:
        A "stuff"-type ``RetrievalQA`` chain configured to also return the
        source documents alongside the answer.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
    )
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        # temperature/top_p only apply when sampling is enabled; without
        # do_sample=True decoding is greedy and both settings are ignored.
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    llm = HuggingFacePipeline(pipeline=pipe)

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        return_source_documents=True,
    )
    return qa_chain
src/requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ langchain==0.3.27
2
+ langchain_huggingface==0.3.1
3
+ streamlit==1.49.1
4
+ transformers==4.56.1