JumaRubea committed (verified)
Commit 097199d
Parent: 37d2f2c

Upload 3 files

Files changed (3)
  1. app.py +85 -0
  2. me.txt +32 -0
  3. rag_components.py +139 -0
app.py ADDED
@@ -0,0 +1,85 @@
+ import streamlit as st
+ from rag_components import load_documents, split_documents, create_embeddings, setup_vector_store, create_qa_chain
+ import os
+
+ # Ensure cache directories exist
+ cache_dirs = ["/tmp/huggingface_cache", "/tmp/transformers_cache", "/tmp/hf_hub_cache", "/tmp/sentence_transformers_cache"]
+ for cache_dir in cache_dirs:
+     os.makedirs(cache_dir, exist_ok=True)
+
+ st.set_page_config(
+     page_title="Document Chatbot",
+     page_icon="📚",
+     layout="wide"
+ )
+ st.title("📚 Chat with your Documents")
+
+ @st.cache_resource
+ def initialize_rag_components(file_path="me.txt"):
+     """Initializes and caches RAG components with better error handling."""
+     try:
+         if not os.path.exists(file_path):
+             st.error(f"Error: Document file not found at {file_path}")
+             return None, None
+
+         with st.spinner("Loading documents..."):
+             documents = load_documents(file_path)
+
+         with st.spinner("Splitting documents into chunks..."):
+             docs = split_documents(documents)
+
+         with st.spinner("Creating embeddings (this may take a while)..."):
+             embeddings = create_embeddings()
+
+         with st.spinner("Setting up vector store..."):
+             retriever = setup_vector_store(docs, embeddings)
+
+         with st.spinner("Initializing QA chain..."):
+             qa_chain = create_qa_chain(retriever)
+
+         st.success("✅ RAG system initialized successfully!")
+         return qa_chain, retriever
+
+     except Exception as e:
+         st.error(f"❌ Error initializing RAG components: {e}")
+         st.info("💡 This might be due to model download issues. Please try refreshing the page.")
+         return None, None
+
+ qa_chain, retriever = initialize_rag_components()
+
+ if qa_chain is not None:
+     # Initialize chat history
+     if "messages" not in st.session_state:
+         st.session_state.messages = []
+
+     # Display chat messages from history on app rerun
+     for message in st.session_state.messages:
+         with st.chat_message(message["role"]):
+             st.markdown(message["content"])
+
+     # React to user input
+     if prompt := st.chat_input("Ask me a question about the document"):
+         # Save the user message to history and display it
+         st.session_state.messages.append({"role": "user", "content": prompt})
+         with st.chat_message("user"):
+             st.markdown(prompt)
+
+         # Display assistant response in chat message container
+         with st.chat_message("assistant"):
+             message_placeholder = st.empty()
+             full_response = ""
+             try:
+                 # Assuming qa_chain.stream() yields dictionaries with a 'result' key
+                 for chunk in qa_chain.stream(prompt):
+                     if 'result' in chunk:
+                         full_response += chunk['result']
+                         message_placeholder.markdown(full_response + "▌")
+                 message_placeholder.markdown(full_response)
+             except Exception as e:
+                 st.error(f"An error occurred: {e}")
+                 full_response = "Sorry, I could not process your request."
+
+         # Add assistant response to chat history
+         st.session_state.messages.append({"role": "assistant", "content": full_response})
+ else:
+     st.warning("RAG components could not be initialized. Please check the document file path.")
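
To try the app locally (assuming Streamlit and the LangChain/Transformers dependencies are installed), run "streamlit run app.py" from the repository root; initialize_rag_components looks for me.txt in the working directory.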
me.txt ADDED
@@ -0,0 +1,32 @@
+ # About Me
+
+ My name is Juma Rubea. I am passionate about artificial intelligence, software development, and data science.
+ I currently live in Dar es Salaam, Tanzania, and work as a Junior Data Scientist.
+
+ # Skills and Expertise
+ - Programming Languages: Python (focused on AI, ML, and data science)
+ - AI/ML Tools: LangChain, Hugging Face Transformers, PyTorch, TensorFlow
+ - Databases: PostgreSQL, MongoDB, Chroma, FAISS
+ - Cloud & DevOps: AWS, Docker, Kubernetes
+
+ # Education
+ I studied [Your Degree, e.g., Computer Science] at [Your University].
+ I have taken specialized courses in machine learning, natural language processing, and cloud computing.
+
+ # Professional Experience
+ - Data Scientist at SkyConnect (2 years)
+ - Worked on computer vision
+ - Built Sevia using Mask R-CNN, DeepLabv3, etc.
+
+ # Projects
+ - Chatbot Development: Created a chatbot using LangChain and Hugging Face.
+ - RAG Systems: Implemented retrieval-augmented generation pipelines with TinyLlama.
+ - Data Engineering: Built data pipelines for structured and unstructured data.
+
+ # Hobbies & Interests
+ In my free time, I enjoy reading tech blogs, playing chess, traveling, contributing to open source, and swimming.
+
+ # Contact
+ - Email: rubeajuma8@gmail.com
+ - GitHub: github.jumarubea.com
+ - LinkedIn: link.jumarubea.com
rag_components.py ADDED
@@ -0,0 +1,139 @@
+ import os
+ from langchain.vectorstores import Chroma
+ from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.document_loaders import TextLoader
+ from langchain.chains import RetrievalQA
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+
+ # Set cache directories for Hugging Face Spaces
+ os.environ["HF_HOME"] = "/tmp/huggingface_cache"
+ os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"
+ os.environ["HF_HUB_CACHE"] = "/tmp/hf_hub_cache"
+ os.environ["SENTENCE_TRANSFORMERS_HOME"] = "/tmp/sentence_transformers_cache"
+
+ # Create cache directories if they don't exist
+ for cache_dir in ["/tmp/huggingface_cache", "/tmp/transformers_cache", "/tmp/hf_hub_cache", "/tmp/sentence_transformers_cache"]:
+     os.makedirs(cache_dir, exist_ok=True)
+
+ def load_documents(file_path: str):
+     """Loads documents from a specified file path."""
+     loader = TextLoader(file_path)
+     return loader.load()
+
+ def split_documents(documents, chunk_size=500, chunk_overlap=50):
+     """Splits documents into chunks."""
+     splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     return splitter.split_documents(documents)
+
+ def create_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
+     """Creates HuggingFace embeddings with proper cache handling."""
+     try:
+         # Use a local cache directory that HF Spaces can write to
+         embeddings = HuggingFaceEmbeddings(
+             model_name=model_name,
+             cache_folder="/tmp/sentence_transformers_cache"
+         )
+         return embeddings
+     except Exception as e:
+         print(f"Error creating embeddings with {model_name}: {e}")
+         # Fall back to a different model if the primary fails
+         try:
+             print("Trying fallback model: sentence-transformers/paraphrase-MiniLM-L6-v2")
+             embeddings = HuggingFaceEmbeddings(
+                 model_name="sentence-transformers/paraphrase-MiniLM-L6-v2",
+                 cache_folder="/tmp/sentence_transformers_cache"
+             )
+             return embeddings
+         except Exception as e2:
+             print(f"Fallback model also failed: {e2}")
+             raise e2
+
+ def setup_vector_store(docs, embeddings, persist_directory="./chroma_db"):
+     """Sets up and persists the Chroma vector store."""
+     db = Chroma.from_documents(docs, embeddings, persist_directory=persist_directory)
+     return db.as_retriever()
+
+ def create_qa_chain(retriever, model_name="microsoft/DialoGPT-medium"):
+     """Creates the RetrievalQA chain with streaming capabilities.
+     Uses a smaller, more reliable model for Hugging Face Spaces."""
+     try:
+         tokenizer = AutoTokenizer.from_pretrained(
+             model_name,
+             cache_dir="/tmp/transformers_cache",
+             trust_remote_code=True
+         )
+
+         # Add a padding token if it doesn't exist
+         if tokenizer.pad_token is None:
+             tokenizer.pad_token = tokenizer.eos_token
+
+         model = AutoModelForCausalLM.from_pretrained(
+             model_name,
+             cache_dir="/tmp/transformers_cache",
+             device_map="auto",
+             trust_remote_code=True,
+             torch_dtype="auto"  # Let transformers choose the best dtype
+         )
+
+         pipe = pipeline(
+             "text-generation",
+             model=model,
+             tokenizer=tokenizer,
+             max_new_tokens=256,  # Reduced for faster generation
+             temperature=0.7,
+             top_p=0.9,
+             pad_token_id=tokenizer.eos_token_id
+         )
+
+         llm = HuggingFacePipeline(pipeline=pipe)
+
+         qa_chain = RetrievalQA.from_chain_type(
+             llm=llm,
+             retriever=retriever,
+             chain_type="stuff",
+             return_source_documents=True
+         )
+         return qa_chain
+
+     except Exception as e:
+         print(f"Error loading model {model_name}: {e}")
+         # Try an even smaller model as a fallback
+         try:
+             print("Trying fallback model: distilgpt2")
+             return create_qa_chain_fallback(retriever)
+         except Exception as e2:
+             print(f"Fallback model also failed: {e2}")
+             raise e2
+
+ def create_qa_chain_fallback(retriever):
+     """Fallback QA chain with a very small model."""
+     tokenizer = AutoTokenizer.from_pretrained(
+         "distilgpt2",
+         cache_dir="/tmp/transformers_cache"
+     )
+     tokenizer.pad_token = tokenizer.eos_token
+
+     model = AutoModelForCausalLM.from_pretrained(
+         "distilgpt2",
+         cache_dir="/tmp/transformers_cache"
+     )
+
+     pipe = pipeline(
+         "text-generation",
+         model=model,
+         tokenizer=tokenizer,
+         max_new_tokens=128,
+         temperature=0.7,
+         pad_token_id=tokenizer.eos_token_id
+     )
+
+     llm = HuggingFacePipeline(pipeline=pipe)
+
+     qa_chain = RetrievalQA.from_chain_type(
+         llm=llm,
+         retriever=retriever,
+         chain_type="stuff",
+         return_source_documents=True
+     )
+     return qa_chain
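
For reference, a minimal sketch of how these components compose outside Streamlit, assuming me.txt sits in the working directory and that the first run is allowed to download the embedding and generation models; the sample question is illustrative.

from rag_components import (
    load_documents,
    split_documents,
    create_embeddings,
    setup_vector_store,
    create_qa_chain,
)

documents = load_documents("me.txt")               # list of LangChain Documents
docs = split_documents(documents)                  # 500-character chunks, 50 overlap
embeddings = create_embeddings()                   # MiniLM sentence embeddings
retriever = setup_vector_store(docs, embeddings)   # Chroma-backed retriever
qa_chain = create_qa_chain(retriever)              # RetrievalQA over the pipeline LLM

# RetrievalQA takes a "query" key and, because the chain is built with
# return_source_documents=True, returns the answer plus the retrieved chunks.
result = qa_chain.invoke({"query": "Where does Juma Rubea live?"})
print(result["result"])
for doc in result["source_documents"]:
    print("-", doc.page_content[:80])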