Spaces:

nivakaran
/

greenStep

Sleeping

App Files Files Community

sliitguy committed on Oct 14, 2025

Commit

f0eeb06

0 Parent(s):

updated initial files

Browse files

Files changed (10) hide show

.gitattributes +1 -0
.gitignore +21 -0
.python-version +1 -0
Dockerfile +17 -0
README.md +34 -0
app.py +442 -0
main.py +6 -0
pyproject.toml +23 -0
requirements.txt +15 -0
uv.lock +0 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ *.pdf filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,21 @@

+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+# Virtual environments
+.venv
+.env
+__pycache__/
+*.pyc
+.env
+.venv/
+venv/
+greenstep_education.db/
+*.log
+.DS_Store

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.11

Dockerfile ADDED Viewed

	@@ -0,0 +1,17 @@

+# Use Python 3.11 as base image (matches .python-version and pyproject's requires-python >=3.11)
+FROM python:3.11
+# Set working directory
+WORKDIR /app
+# Copy all files to container
+COPY . /app
+# Install Python dependencies
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+# Expose port 7860 (Hugging Face Spaces default port)
+EXPOSE 7860
+# Run the FastAPI app
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

README.md ADDED Viewed

	@@ -0,0 +1,34 @@

+---
+title: GreenStep Education Chatbot
+emoji: 🌱
+colorFrom: green
+colorTo: blue
+sdk: docker
+app_port: 7860
+pinned: false
+license: mit
+---
+# GreenStep Education Assistant
+An AI-powered chatbot for reforestation education, providing information about tree planting, forest conservation, and environmental sustainability.
+## Features
+- RAG-based question answering using PDF educational content
+- MongoDB chat history persistence
+- Multi-method PDF text extraction
+- Conversational AI with context awareness
+## API Endpoints
+- `POST /ask` - Ask questions about reforestation
+- `POST /history` - Get chat history for a session
+- `DELETE /history/{session_id}` - Clear session history
+- `GET /health` - Health check endpoint
+- `GET /` - API information
+## Environment Variables Required
+Set these in your Space's Settings:
+- `HF_TOKEN` - Hugging Face API token
+- `GROQ_API_KEY` - Groq API key
+- `MONGODB_URL` - MongoDB connection string
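+- `PDF_PATH` - Path to educational PDF (optional, default: ./reforestation_content.pdf)
+- `MONGODB_DATABASE` - MongoDB database name (optional, default: greenstep_education)
+- `MONGODB_COLLECTION` - MongoDB collection used for chat history (optional, default: chat_history)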

app.py ADDED Viewed

	@@ -0,0 +1,442 @@

+import os
+import io
+import json
+import re
+import logging
+import tempfile
+import base64
+from uuid import uuid4
+from typing import Optional, List
+from fastapi import FastAPI, UploadFile, File, HTTPException
+from fastapi.responses import JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from dotenv import load_dotenv
+from langchain.chains import create_history_aware_retriever, create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain_mongodb.chat_message_histories import MongoDBChatMessageHistory
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain_core.documents import Document
+from langchain_groq import ChatGroq
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_chroma import Chroma
+from pymongo import MongoClient
+# Alternative PDF libraries for fallback
+try:
+    from pypdf import PdfReader
+    PYPDF_AVAILABLE = True
+except ImportError:
+    PYPDF_AVAILABLE = False
+try:
+    import fitz  # PyMuPDF
+    PYMUPDF_AVAILABLE = True
+except ImportError:
+    PYMUPDF_AVAILABLE = False
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Load environment variables
+load_dotenv()
+HF_TOKEN = os.getenv("HF_TOKEN")
+GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+MONGODB_URL = os.getenv("MONGODB_URL")
+MONGODB_DATABASE = os.getenv("MONGODB_DATABASE", "greenstep_education")
+MONGODB_COLLECTION = os.getenv("MONGODB_COLLECTION", "chat_history")
+HOST = os.getenv("HOST", "0.0.0.0")
+PORT = int(os.getenv("PORT", 5000))
+PDF_PATH = os.getenv("PDF_PATH", "./reforestation_content.pdf")
+# Validate environment variables
+if not all([HF_TOKEN, GROQ_API_KEY, PDF_PATH, MONGODB_URL]):
+    logger.error("Missing required environment variables")
+    raise RuntimeError("Environment variables not set. Please check HF_TOKEN, GROQ_API_KEY, PDF_PATH, and MONGODB_URL")
+# Initialize MongoDB client
+try:
+    mongo_client = MongoClient(MONGODB_URL)
+    mongo_client.admin.command('ping')
+    logger.info("MongoDB connection successful")
+except Exception as e:
+    logger.error(f"Failed to connect to MongoDB: {str(e)}")
+    raise RuntimeError("MongoDB connection failed")
+# Initialize FastAPI app
+app = FastAPI(
+    title="GreenStep Education API",
+    description="Educational chatbot API for GreenStep reforestation app.",
+    version="1.0.0",
+)
+# Configure CORS: any origin may call GET/POST/DELETE with any request header.
+# NOTE(review): wildcard origins are combined with allow_credentials=True here;
+# confirm credentialed cross-origin requests are really needed, otherwise list
+# explicit origins or disable credentials.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["GET", "POST", "DELETE"],
+    allow_headers=["*"],
+)
+# Initialize RAG components. Both objects are created once at import time and
+# are shared by every request handled by this module.
+# NOTE(review): no API key is passed explicitly to ChatGroq; presumably it reads
+# GROQ_API_KEY from the environment - confirm.
+embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+llm = ChatGroq(model_name="openai/gpt-oss-20b")
+def extract_text_with_pypdf(file_path: str) -> List[Document]:
+    """Extract text using pypdf library directly"""
+    try:
+        reader = PdfReader(file_path)
+        documents = []
+        for page_num, page in enumerate(reader.pages):
+            text = page.extract_text()
+            if text.strip():  # Only add non-empty pages
+                doc = Document(
+                    page_content=text,
+                    metadata={"source": file_path, "page": page_num}
+                )
+                documents.append(doc)
+        logger.info(f"pypdf extracted text from {len(documents)} pages")
+        return documents
+    except Exception as e:
+        logger.error(f"pypdf extraction failed: {str(e)}")
+        return []
+def extract_text_with_pymupdf(file_path: str) -> List[Document]:
+    """Extract text using PyMuPDF (fitz) library - often better for complex PDFs"""
+    try:
+        doc = fitz.open(file_path)
+        documents = []
+        for page_num in range(len(doc)):
+            page = doc.load_page(page_num)
+            text = page.get_text()
+            if text.strip():  # Only add non-empty pages
+                document = Document(
+                    page_content=text,
+                    metadata={"source": file_path, "page": page_num}
+                )
+                documents.append(document)
+        doc.close()
+        logger.info(f"PyMuPDF extracted text from {len(documents)} pages")
+        return documents
+    except Exception as e:
+        logger.error(f"PyMuPDF extraction failed: {str(e)}")
+        return []
+def process_pdf(file_path: str):
+    """Process PDF with multiple fallback methods for robust text extraction"""
+    try:
+        # Check if file exists
+        if not os.path.exists(file_path):
+            raise FileNotFoundError(f"PDF file not found at: {file_path}")
+        logger.info(f"Processing PDF from: {file_path}")
+        documents = []
+        # Method 1: Try LangChain's PyPDFLoader (uses pypdf internally)
+        try:
+            logger.info("Attempting extraction with PyPDFLoader...")
+            loader = PyPDFLoader(file_path)
+            documents = loader.load()
+            if documents and any(doc.page_content.strip() for doc in documents):
+                logger.info(f"PyPDFLoader successfully loaded {len(documents)} pages")
+            else:
+                documents = []
+                logger.warning("PyPDFLoader returned empty documents")
+        except Exception as e:
+            logger.warning(f"PyPDFLoader failed: {str(e)}")
+        # Method 2: Try direct pypdf if available and previous method failed
+        if not documents and PYPDF_AVAILABLE:
+            logger.info("Attempting extraction with pypdf directly...")
+            documents = extract_text_with_pypdf(file_path)
+        # Method 3: Try PyMuPDF as fallback (often best for complex PDFs)
+        if not documents and PYMUPDF_AVAILABLE:
+            logger.info("Attempting extraction with PyMuPDF (fitz)...")
+            documents = extract_text_with_pymupdf(file_path)
+        # Validate that documents were loaded
+        if not documents:
+            raise ValueError(
+                "Failed to extract text from PDF with all available methods. "
+                "The PDF might be:\n"
+                "1. Empty or corrupted\n"
+                "2. Password-protected\n"
+                "3. Scanned images without OCR (consider using pytesseract)\n"
+                "4. Using unsupported encryption"
+            )
+        # Check if any text was actually extracted
+        total_text = "".join([doc.page_content for doc in documents])
+        if not total_text.strip():
+            raise ValueError("No text content found in PDF. It may contain only images.")
+        logger.info(f"Successfully extracted {len(total_text)} characters from {len(documents)} pages")
+        # Split documents into chunks
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=5000,
+            chunk_overlap=500,
+            length_function=len,
+            separators=["\n\n", "\n", ". ", " ", ""]
+        )
+        splits = text_splitter.split_documents(documents)
+        # Filter out empty chunks
+        splits = [doc for doc in splits if doc.page_content.strip()]
+        if not splits:
+            raise ValueError("Text splitting resulted in zero valid chunks.")
+        logger.info(f"Created {len(splits)} text chunks for vectorization")
+        # Create vectorstore
+        vectorstore = Chroma.from_documents(
+            documents=splits,
+            embedding=embeddings,
+            persist_directory="./greenstep_education.db"
+        )
+        logger.info("Vectorstore created successfully")
+        return vectorstore
+    except FileNotFoundError as e:
+        logger.error(f"File not found: {str(e)}")
+        raise RuntimeError(f"PDF file not found: {str(e)}")
+    except ValueError as e:
+        logger.error(f"Invalid PDF content: {str(e)}")
+        raise RuntimeError(f"PDF processing failed: {str(e)}")
+    except Exception as e:
+        logger.error(f"Unexpected error processing PDF: {str(e)}", exc_info=True)
+        raise RuntimeError(f"PDF processing failed: {str(e)}")
+def get_session_history(session_id: str) -> MongoDBChatMessageHistory:
+    """Get MongoDB chat message history for a session"""
+    return MongoDBChatMessageHistory(
+        connection_string=MONGODB_URL,
+        session_id=session_id,
+        database_name=MONGODB_DATABASE,
+        collection_name=MONGODB_COLLECTION,
+        create_index=True
+    )
+# Initialize vectorstore
+try:
+    logger.info(f"Initializing vectorstore from PDF: {PDF_PATH}")
+    vectorstore = process_pdf(PDF_PATH)
+    retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
+    logger.info("Vectorstore initialized successfully")
+except Exception as e:
+    logger.error(f"Vectorstore initialization failed: {str(e)}")
+    logger.error("\nTroubleshooting steps:")
+    logger.error("1. Verify PDF file exists at the specified path")
+    logger.error("2. Ensure PDF contains extractable text (not just scanned images)")
+    logger.error("3. Check if PDF is password-protected")
+    logger.error("4. Try opening the PDF manually to verify it's not corrupted")
+    logger.error("\nInstall additional libraries for better PDF support:")
+    logger.error("  pip install pypdf pymupdf")
+    raise RuntimeError(f"Vectorstore initialization failed: {str(e)}")
+# Request body for POST /ask.
+class QuestionRequest(BaseModel):
+    # Identifier of the conversation whose chat history is read and updated.
+    session_id: str
+    # The user's question text.
+    question: str
+# Response body for POST /ask.
+class QuestionResponse(BaseModel):
+    # Answer text returned to the client.
+    answer: str
+# Request body for POST /history.
+class SessionHistoryRequest(BaseModel):
+    # Identifier of the conversation whose history is requested.
+    session_id: str
+# Response body for POST /history.
+class SessionHistoryResponse(BaseModel):
+    # Identifier of the conversation, echoed from the request.
+    session_id: str
+    # Number of entries in `messages`.
+    message_count: int
+    # Stored messages, each a dict with "type" and "content" keys.
+    messages: List[dict]
+@app.post(
+    "/ask",
+    response_model=QuestionResponse,
+    summary="Ask the GreenStep education assistant",
+    description="Submit a question to learn about reforestation, trees, forests, and environmental conservation."
+)
+async def ask_question(request: QuestionRequest):
+    """Handle question and maintain chat history in MongoDB"""
+    session_id = request.session_id
+    question = request.question
+    logger.info(f"Received question for session {session_id}: {question}")
+    try:
+        # Get MongoDB chat history
+        history = get_session_history(session_id)
+        all_messages = history.messages
+        last_messages = all_messages[-6:] if len(all_messages) > 6 else all_messages
+        # Contextualize question
+        contextualize_q_prompt = ChatPromptTemplate.from_messages([
+            ("system", "Rephrase the user's question considering the chat history to provide better context."),
+            MessagesPlaceholder("chat_history"),
+            ("human", "{input}")
+        ])
+        history_aware_retriever = create_history_aware_retriever(
+            llm, retriever, contextualize_q_prompt
+        )
+        # System prompt for GreenStep
+        system_prompt = """You are the GreenStep Education Assistant, a friendly, knowledgeable, and inspiring chatbot
+                            designed to educate users about reforestation, tree planting, forest conservation, and environmental
+                            sustainability within the GreenStep app's Education tab.
+                            Your primary mission is to empower users with accurate, actionable knowledge about forests and their
+                            role in combating climate change, while fostering a deep appreciation for nature and encouraging
+                            environmental action.
+                            Use the following verified educational content to answer questions:
+                            {context}
+                            Your responses should be:
+                            1. Educational and engaging about reforestation, tree species, planting techniques, and environmental benefits
+                            2. Scientifically accurate based on the provided educational content
+                            3. Inspiring and action-oriented, motivating users to participate in reforestation
+                            4. Accessible to diverse audiences with clear, jargon-free language
+                            5. Balanced and honest about both opportunities and challenges
+                            6. Interactive and conversational, building on previous discussions
+                            7. Positive and solutions-focused, emphasizing hope and agency
+                            Remember: Transform users from passive learners into informed environmental advocates who
+                            understand reforestation science and feel empowered to contribute through GreenStep.
+                        """
+        qa_prompt = ChatPromptTemplate.from_messages([
+            ("system", system_prompt),
+            MessagesPlaceholder("chat_history"),
+            ("human", "{input}")
+        ])
+        question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
+        rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
+        # Get response
+        result = rag_chain.invoke({
+            "input": question,
+            "chat_history": last_messages
+        })
+        raw_answer = result["answer"]
+        # Remove <think>...</think> blocks
+        cleaned_answer = re.sub(r"<think>.*?</think>\s*", "", raw_answer, flags=re.DOTALL).strip()
+        # Update history
+        history.add_user_message(question)
+        history.add_ai_message(cleaned_answer)
+        logger.info(f"Response saved to MongoDB for session {session_id}")
+        return QuestionResponse(answer=cleaned_answer)
+    except Exception as e:
+        logger.error(f"Error processing question: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
+@app.post("/history", response_model=SessionHistoryResponse)
+async def get_history(request: SessionHistoryRequest):
+    """Retrieve chat history for a session"""
+    try:
+        history = get_session_history(request.session_id)
+        messages = history.messages
+        messages_dict = [{"type": msg.type, "content": msg.content} for msg in messages]
+        return SessionHistoryResponse(
+            session_id=request.session_id,
+            message_count=len(messages),
+            messages=messages_dict
+        )
+    except Exception as e:
+        logger.error(f"Error retrieving history: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Failed to retrieve history: {str(e)}")
+@app.delete("/history/{session_id}")
+async def clear_history(session_id: str):
+    """Clear chat history for a session"""
+    try:
+        history = get_session_history(session_id)
+        history.clear()
+        logger.info(f"Cleared history for session {session_id}")
+        return {"message": f"History cleared for session {session_id}"}
+    except Exception as e:
+        logger.error(f"Error clearing history: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Failed to clear history: {str(e)}")
+@app.get("/health")
+async def health_check():
+    """Health check endpoint"""
+    try:
+        mongo_client.admin.command('ping')
+        mongo_status = "connected"
+    except Exception as e:
+        mongo_status = f"disconnected: {str(e)}"
+    return {
+        "status": "healthy",
+        "app": "GreenStep Education Assistant",
+        "mongodb": mongo_status,
+        "vectorstore": "initialized" if vectorstore else "not initialized",
+        "pdf_libraries": {
+            "pypdf": PYPDF_AVAILABLE,
+            "pymupdf": PYMUPDF_AVAILABLE
+        }
+    }
+@app.get("/")
+async def root():
+    return {
+        "message": "Welcome to GreenStep Education API",
+        "description": "Learn about reforestation, tree planting, and environmental conservation",
+        "endpoints": {
+            "ask_question": "/ask",
+            "get_history": "/history",
+            "clear_history": "/history/{session_id}",
+            "health_check": "/health",
+            "documentation": "/docs"
+        }
+    }
+# NOTE(review): FastAPI's documentation recommends lifespan handlers over
+# on_event; consider migrating when this module is next reworked.
+@app.on_event("shutdown")
+async def shutdown_event():
+    """Close the module-level MongoDB client when the application shuts down."""
+    mongo_client.close()
+    logger.info("MongoDB connection closed")
+# Direct execution (python app.py): serve with uvicorn on HOST/PORT from the
+# environment. PORT defaults to 5000 here, whereas the Dockerfile's CMD starts
+# uvicorn itself on port 7860 and does not go through this branch.
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host=HOST, port=PORT)

main.py ADDED Viewed

	@@ -0,0 +1,6 @@

+def main():
+    print("Hello from greenstepchatbot!")
+if __name__ == "__main__":
+    main()

pyproject.toml ADDED Viewed

	@@ -0,0 +1,23 @@

+[project]
+name = "greenstepchatbot"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "dotenv>=0.9.9",
+    "fastapi>=0.119.0",
+    "langchain>=0.3.27",
+    "langchain-chroma>=0.2.6",
+    "langchain-community>=0.3.31",
+    "langchain-core>=0.3.79",
+    "langchain-groq>=0.3.8",
+    "langchain-huggingface>=0.3.1",
+    "langchain-mongodb>=0.7.1",
+    "pillow>=11.3.0",
+    "pydantic>=2.12.1",
+    "pypdf>=6.1.1",
+    "sentence-transformers>=5.1.1",
+    "streamlit>=1.50.0",
+    "uvicorn>=0.37.0",
+]

requirements.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+fastapi
+uvicorn
+langchain
+langchain_groq
+langchain_core
+langchain_community
+langchain_chroma
+langchain_huggingface
+dotenv
+pydantic
+pillow
+sentence-transformers
+pypdf
+streamlit
+langchain_mongodb

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff