JumaRubea committed (verified)
Commit 097199d
Parent: 37d2f2c

Upload 3 files

Files changed (3)
  1. app.py +85 -0
  2. me.txt +32 -0
  3. rag_components.py +139 -0
app.py ADDED
@@ -0,0 +1,85 @@
+ import streamlit as st
+ from rag_components import load_documents, split_documents, create_embeddings, setup_vector_store, create_qa_chain
+ import os
+
+ # Ensure cache directories exist
+ cache_dirs = ["/tmp/huggingface_cache", "/tmp/transformers_cache", "/tmp/hf_hub_cache", "/tmp/sentence_transformers_cache"]
+ for cache_dir in cache_dirs:
+     os.makedirs(cache_dir, exist_ok=True)
+
+ st.set_page_config(
+     page_title="Document Chatbot",
+     page_icon="📚",
+     layout="wide"
+ )
+ st.title("📚 Chat with your Documents")
+
+ @st.cache_resource
+ def initialize_rag_components(file_path="me.txt"):
+     """Initializes and caches RAG components with better error handling."""
+     try:
+         if not os.path.exists(file_path):
+             st.error(f"Error: Document file not found at {file_path}")
+             return None, None
+
+         with st.spinner("Loading documents..."):
+             documents = load_documents(file_path)
+
+         with st.spinner("Splitting documents into chunks..."):
+             docs = split_documents(documents)
+
+         with st.spinner("Creating embeddings (this may take a while)..."):
+             embeddings = create_embeddings()
+
+         with st.spinner("Setting up vector store..."):
+             retriever = setup_vector_store(docs, embeddings)
+
+         with st.spinner("Initializing QA chain..."):
+             qa_chain = create_qa_chain(retriever)
+
+         st.success("✅ RAG system initialized successfully!")
+         return qa_chain, retriever
+
+     except Exception as e:
+         st.error(f"❌ Error initializing RAG components: {e}")
+         st.info("💡 This might be due to model download issues. Please try refreshing the page.")
+         return None, None
+
+ qa_chain, retriever = initialize_rag_components()
+
+ if qa_chain is not None:
+     # Initialize chat history
+     if "messages" not in st.session_state:
+         st.session_state.messages = []
+
+     # Display chat messages from history on app rerun
+     for message in st.session_state.messages:
+         with st.chat_message(message["role"]):
+             st.markdown(message["content"])
+
+     # React to user input
+     if prompt := st.chat_input("Ask me a question about the document"):
+         # Save the user message to history and display it
+         st.session_state.messages.append({"role": "user", "content": prompt})
+         with st.chat_message("user"):
+             st.markdown(prompt)
+
+         # Display assistant response in chat message container
+         with st.chat_message("assistant"):
+             message_placeholder = st.empty()
+             full_response = ""
+             try:
+                 # Assuming qa_chain.stream() yields dictionaries with a 'result' key
+                 for chunk in qa_chain.stream(prompt):
+                     if 'result' in chunk:
+                         full_response += chunk['result']
+                         message_placeholder.markdown(full_response + "▌")
+                 message_placeholder.markdown(full_response)
+             except Exception as e:
+                 st.error(f"An error occurred: {e}")
+                 full_response = "Sorry, I could not process your request."
+
+         # Add assistant response to chat history
+         st.session_state.messages.append({"role": "assistant", "content": full_response})
+ else:
+     st.warning("RAG components could not be initialized. Please check the document file path.")
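
To try the app locally (assuming Streamlit and the LangChain/Transformers dependencies are installed), run "streamlit run app.py" from the repository root; initialize_rag_components looks for me.txt in the working directory.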
me.txt ADDED
@@ -0,0 +1,32 @@
+ # About Me
+
+ My name is Juma Rubea. I am passionate about artificial intelligence, software development, and data science.
+ I currently live in Dar es Salaam, Tanzania, and work as a Junior Data Scientist.
+
+ # Skills and Expertise
+ - Programming Languages: Python (focused on AI, ML, and data science)
+ - AI/ML Tools: LangChain, Hugging Face Transformers, PyTorch, TensorFlow
+ - Databases: PostgreSQL, MongoDB, Chroma, FAISS
+ - Cloud & DevOps: AWS, Docker, Kubernetes
+
+ # Education
+ I studied [Your Degree, e.g., Computer Science] at [Your University].
+ I have taken specialized courses in machine learning, natural language processing, and cloud computing.
+
+ # Professional Experience
+ - Data Scientist at SkyConnect (2 years)
+ - Worked on computer vision
+ - Built Sevia using Mask R-CNN, DeepLabv3, etc.
+
+ # Projects
+ - Chatbot Development: Created a chatbot using LangChain and Hugging Face.
+ - RAG Systems: Implemented retrieval-augmented generation pipelines with TinyLlama.
+ - Data Engineering: Built data pipelines for structured and unstructured data.
+
+ # Hobbies & Interests
+ In my free time, I enjoy reading tech blogs, playing chess, traveling, contributing to open source, and swimming.
+
+ # Contact
+ - Email: rubeajuma8@gmail.com
+ - GitHub: github.jumarubea.com
+ - LinkedIn: link.jumarubea.com
rag_components.py ADDED
@@ -0,0 +1,139 @@
+ import os
+ from langchain.vectorstores import Chroma
+ from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.document_loaders import TextLoader
+ from langchain.chains import RetrievalQA
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+
+ # Set cache directories for Hugging Face Spaces
+ os.environ["HF_HOME"] = "/tmp/huggingface_cache"
+ os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"
+ os.environ["HF_HUB_CACHE"] = "/tmp/hf_hub_cache"
+ os.environ["SENTENCE_TRANSFORMERS_HOME"] = "/tmp/sentence_transformers_cache"
+
+ # Create cache directories if they don't exist
+ for cache_dir in ["/tmp/huggingface_cache", "/tmp/transformers_cache", "/tmp/hf_hub_cache", "/tmp/sentence_transformers_cache"]:
+     os.makedirs(cache_dir, exist_ok=True)
+
+ def load_documents(file_path: str):
+     """Loads documents from a specified file path."""
+     loader = TextLoader(file_path)
+     return loader.load()
+
+ def split_documents(documents, chunk_size=500, chunk_overlap=50):
+     """Splits documents into chunks."""
+     splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     return splitter.split_documents(documents)
+
+ def create_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
+     """Creates HuggingFace embeddings with proper cache handling."""
+     try:
+         # Use a local cache directory that HF Spaces can write to
+         embeddings = HuggingFaceEmbeddings(
+             model_name=model_name,
+             cache_folder="/tmp/sentence_transformers_cache"
+         )
+         return embeddings
+     except Exception as e:
+         print(f"Error creating embeddings with {model_name}: {e}")
+         # Fall back to a different model if the primary fails
+         try:
+             print("Trying fallback model: sentence-transformers/paraphrase-MiniLM-L6-v2")
+             embeddings = HuggingFaceEmbeddings(
+                 model_name="sentence-transformers/paraphrase-MiniLM-L6-v2",
+                 cache_folder="/tmp/sentence_transformers_cache"
+             )
+             return embeddings
+         except Exception as e2:
+             print(f"Fallback model also failed: {e2}")
+             raise e2
+
+ def setup_vector_store(docs, embeddings, persist_directory="./chroma_db"):
+     """Sets up and persists the Chroma vector store."""
+     db = Chroma.from_documents(docs, embeddings, persist_directory=persist_directory)
+     return db.as_retriever()
+
+ def create_qa_chain(retriever, model_name="microsoft/DialoGPT-medium"):
+     """Creates the RetrievalQA chain with streaming capabilities.
+     Uses a smaller, more reliable model for Hugging Face Spaces."""
+     try:
+         tokenizer = AutoTokenizer.from_pretrained(
+             model_name,
+             cache_dir="/tmp/transformers_cache",
+             trust_remote_code=True
+         )
+
+         # Add a padding token if it doesn't exist
+         if tokenizer.pad_token is None:
+             tokenizer.pad_token = tokenizer.eos_token
+
+         model = AutoModelForCausalLM.from_pretrained(
+             model_name,
+             cache_dir="/tmp/transformers_cache",
+             device_map="auto",
+             trust_remote_code=True,
+             torch_dtype="auto"  # Let transformers choose the best dtype
+         )
+
+         pipe = pipeline(
+             "text-generation",
+             model=model,
+             tokenizer=tokenizer,
+             max_new_tokens=256,  # Reduced for faster generation
+             temperature=0.7,
+             top_p=0.9,
+             pad_token_id=tokenizer.eos_token_id
+         )
+
+         llm = HuggingFacePipeline(pipeline=pipe)
+
+         qa_chain = RetrievalQA.from_chain_type(
+             llm=llm,
+             retriever=retriever,
+             chain_type="stuff",
+             return_source_documents=True
+         )
+         return qa_chain
+
+     except Exception as e:
+         print(f"Error loading model {model_name}: {e}")
+         # Try an even smaller model as a fallback
+         try:
+             print("Trying fallback model: distilgpt2")
+             return create_qa_chain_fallback(retriever)
+         except Exception as e2:
+             print(f"Fallback model also failed: {e2}")
+             raise e2
+
+ def create_qa_chain_fallback(retriever):
+     """Fallback QA chain with a very small model."""
+     tokenizer = AutoTokenizer.from_pretrained(
+         "distilgpt2",
+         cache_dir="/tmp/transformers_cache"
+     )
+     tokenizer.pad_token = tokenizer.eos_token
+
+     model = AutoModelForCausalLM.from_pretrained(
+         "distilgpt2",
+         cache_dir="/tmp/transformers_cache"
+     )
+
+     pipe = pipeline(
+         "text-generation",
+         model=model,
+         tokenizer=tokenizer,
+         max_new_tokens=128,
+         temperature=0.7,
+         pad_token_id=tokenizer.eos_token_id
+     )
+
+     llm = HuggingFacePipeline(pipeline=pipe)
+
+     qa_chain = RetrievalQA.from_chain_type(
+         llm=llm,
+         retriever=retriever,
+         chain_type="stuff",
+         return_source_documents=True
+     )
+     return qa_chain
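
For reference, a minimal sketch of how these components compose outside Streamlit, assuming me.txt sits in the working directory and that the first run is allowed to download the embedding and generation models; the sample question is illustrative.

from rag_components import (
    load_documents,
    split_documents,
    create_embeddings,
    setup_vector_store,
    create_qa_chain,
)

documents = load_documents("me.txt")               # list of LangChain Documents
docs = split_documents(documents)                  # 500-character chunks, 50 overlap
embeddings = create_embeddings()                   # MiniLM sentence embeddings
retriever = setup_vector_store(docs, embeddings)   # Chroma-backed retriever
qa_chain = create_qa_chain(retriever)              # RetrievalQA over the pipeline LLM

# RetrievalQA takes a "query" key and, because the chain is built with
# return_source_documents=True, returns the answer plus the retrieved chunks.
result = qa_chain.invoke({"query": "Where does Juma Rubea live?"})
print(result["result"])
for doc in result["source_documents"]:
    print("-", doc.page_content[:80])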