File size: 16,405 Bytes
f0eeb06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
import os
import io
import json
import re
import logging
import tempfile
import base64
from uuid import uuid4
from typing import Optional, List
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from dotenv import load_dotenv
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_mongodb.chat_message_histories import MongoDBChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.documents import Document
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_chroma import Chroma
from pymongo import MongoClient

# Alternative PDF libraries for fallback
# Both imports are optional. Each *_AVAILABLE flag records whether the import
# succeeded so that process_pdf() can skip a fallback whose library is missing
# instead of failing at import time.
try:
    from pypdf import PdfReader
    PYPDF_AVAILABLE = True
except ImportError:
    # pypdf not installed: the direct-pypdf fallback is disabled.
    PYPDF_AVAILABLE = False
    
try:
    import fitz  # PyMuPDF
    PYMUPDF_AVAILABLE = True
except ImportError:
    # PyMuPDF not installed: the fitz-based fallback is disabled.
    PYMUPDF_AVAILABLE = False


# Configure logging
# The root logger is configured at INFO; this module logs through its own
# named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# Load environment variables
# load_dotenv() runs before the os.getenv() lookups below, so values it loads
# are visible to them.
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
MONGODB_URL = os.getenv("MONGODB_URL")
MONGODB_DATABASE = os.getenv("MONGODB_DATABASE", "greenstep_education")
MONGODB_COLLECTION = os.getenv("MONGODB_COLLECTION", "chat_history")
HOST = os.getenv("HOST", "0.0.0.0")
# int() raises ValueError at import time if PORT is set to a non-numeric value.
PORT = int(os.getenv("PORT", 5000))
PDF_PATH = os.getenv("PDF_PATH", "./reforestation_content.pdf")


# Validate environment variables
# HF_TOKEN, GROQ_API_KEY and MONGODB_URL have no default, so they must be set.
# PDF_PATH has a default, so it only fails this check when explicitly set to
# an empty string.
# NOTE(review): HF_TOKEN and GROQ_API_KEY are not passed explicitly anywhere in
# the visible code; presumably the client libraries read them from the
# environment -- confirm.
if not all([HF_TOKEN, GROQ_API_KEY, PDF_PATH, MONGODB_URL]):
    logger.error("Missing required environment variables")
    raise RuntimeError("Environment variables not set. Please check HF_TOKEN, GROQ_API_KEY, PDF_PATH, and MONGODB_URL")

# Initialize MongoDB client
# The client is created at import time and shared by the health check and the
# shutdown handler. The 'ping' command is issued immediately so that an
# unreachable or misconfigured server aborts startup instead of surfacing on
# the first request.
try:
    mongo_client = MongoClient(MONGODB_URL)
    mongo_client.admin.command('ping')
    logger.info("MongoDB connection successful")
except Exception as e:
    # Any failure (bad URL, auth, network) is logged and converted into a
    # RuntimeError that stops the module from loading.
    logger.error(f"Failed to connect to MongoDB: {str(e)}")
    raise RuntimeError("MongoDB connection failed")


# Initialize FastAPI app
# The title, description and version below are the API metadata passed to
# FastAPI.
app = FastAPI(
    title="GreenStep Education API",
    description="Educational chatbot API for GreenStep reforestation app.",
    version="1.0.0",
)


# Configure CORS
# Every origin and every request header is allowed; only GET, POST and DELETE
# are permitted as methods.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True lets
# any website make credentialed cross-origin requests to this API. If the
# frontend does not rely on cookies/credentials, consider disabling
# allow_credentials or listing the trusted origins explicitly -- confirm with
# the client team before changing.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["GET", "POST", "DELETE"],
    allow_headers=["*"],
)


# Initialize RAG components
# Both objects are module-level singletons created once at import time:
# `embeddings` is used by process_pdf() when building the vector store, and
# `llm` is used by the /ask handler for question rephrasing and answering.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
llm = ChatGroq(model_name="openai/gpt-oss-20b")


def extract_text_with_pypdf(file_path: str) -> List[Document]:
    """Read a PDF with pypdf and return one Document per non-empty page.

    Args:
        file_path: Path of the PDF to read; also stored as the ``source``
            metadata of every returned document.

    Returns:
        Documents whose ``page`` metadata is the zero-based page index.
        Pages whose extracted text is blank are skipped. An empty list is
        returned when extraction fails for any reason (the error is logged,
        not raised).
    """
    try:
        pdf = PdfReader(file_path)
        # Pair each page's text with its index first, then keep only the
        # pages that actually contain text.
        indexed_text = (
            (index, pdf_page.extract_text())
            for index, pdf_page in enumerate(pdf.pages)
        )
        extracted = [
            Document(
                page_content=content,
                metadata={"source": file_path, "page": index},
            )
            for index, content in indexed_text
            if content.strip()
        ]
    except Exception as e:
        logger.error(f"pypdf extraction failed: {str(e)}")
        return []
    else:
        logger.info(f"pypdf extracted text from {len(extracted)} pages")
        return extracted


def extract_text_with_pymupdf(file_path: str) -> List[Document]:
    """Extract text using PyMuPDF (fitz) library - often better for complex PDFs

    Args:
        file_path: Path of the PDF to read; also stored as the ``source``
            metadata of every returned document.

    Returns:
        One Document per page that contains non-blank text, with the
        zero-based page index in the ``page`` metadata. An empty list is
        returned when extraction fails for any reason (the error is logged,
        not raised).
    """
    try:
        doc = fitz.open(file_path)
        try:
            documents = []

            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                text = page.get_text()
                if text.strip():  # Only add non-empty pages
                    document = Document(
                        page_content=text,
                        metadata={"source": file_path, "page": page_num}
                    )
                    documents.append(document)
        finally:
            # Always release the file handle, even when a page fails to load
            # or extract; previously an error mid-loop skipped close().
            doc.close()

        logger.info(f"PyMuPDF extracted text from {len(documents)} pages")
        return documents
    except Exception as e:
        logger.error(f"PyMuPDF extraction failed: {str(e)}")
        return []


def process_pdf(file_path: str):
    """Process PDF with multiple fallback methods for robust text extraction

    The PDF is loaded with the first extractor that yields text
    (PyPDFLoader, then pypdf directly, then PyMuPDF), split into overlapping
    chunks, embedded with the module-level ``embeddings`` and stored in a
    Chroma collection persisted under ``./greenstep_education.db``.

    Args:
        file_path: Path of the PDF to index.

    Returns:
        The Chroma vector store containing the chunked PDF content.

    Raises:
        RuntimeError: If the file is missing, no text could be extracted,
            splitting produced no chunks, or any other error occurred. The
            original exception is attached as ``__cause__``.
    """
    try:
        # Check if file exists
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"PDF file not found at: {file_path}")

        logger.info(f"Processing PDF from: {file_path}")
        documents = []

        # Method 1: Try LangChain's PyPDFLoader (uses pypdf internally)
        try:
            logger.info("Attempting extraction with PyPDFLoader...")
            loader = PyPDFLoader(file_path)
            documents = loader.load()

            if documents and any(doc.page_content.strip() for doc in documents):
                logger.info(f"PyPDFLoader successfully loaded {len(documents)} pages")
            else:
                # Pages without any text are treated as a failed extraction so
                # the fallbacks below get a chance.
                documents = []
                logger.warning("PyPDFLoader returned empty documents")
        except Exception as e:
            logger.warning(f"PyPDFLoader failed: {str(e)}")

        # Method 2: Try direct pypdf if available and previous method failed
        if not documents and PYPDF_AVAILABLE:
            logger.info("Attempting extraction with pypdf directly...")
            documents = extract_text_with_pypdf(file_path)

        # Method 3: Try PyMuPDF as fallback (often best for complex PDFs)
        if not documents and PYMUPDF_AVAILABLE:
            logger.info("Attempting extraction with PyMuPDF (fitz)...")
            documents = extract_text_with_pymupdf(file_path)

        # Validate that documents were loaded
        if not documents:
            raise ValueError(
                "Failed to extract text from PDF with all available methods. "
                "The PDF might be:\n"
                "1. Empty or corrupted\n"
                "2. Password-protected\n"
                "3. Scanned images without OCR (consider using pytesseract)\n"
                "4. Using unsupported encryption"
            )

        # Check if any text was actually extracted
        total_text = "".join(doc.page_content for doc in documents)
        if not total_text.strip():
            raise ValueError("No text content found in PDF. It may contain only images.")

        logger.info(f"Successfully extracted {len(total_text)} characters from {len(documents)} pages")

        # Split documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=5000,
            chunk_overlap=500,
            length_function=len,
            separators=["\n\n", "\n", ". ", " ", ""]
        )
        splits = text_splitter.split_documents(documents)

        # Filter out empty chunks
        splits = [doc for doc in splits if doc.page_content.strip()]

        if not splits:
            raise ValueError("Text splitting resulted in zero valid chunks.")

        logger.info(f"Created {len(splits)} text chunks for vectorization")

        # Deterministic ids keyed on the source path and chunk position.
        # Without explicit ids every process start appended a fresh copy of
        # all chunks to the persisted collection, so retrieval results
        # gradually filled up with duplicates. With stable ids a restart
        # overwrites the same entries instead.
        # NOTE(review): this relies on langchain_chroma writing via upsert --
        # confirm against the installed version. Chunks stored by earlier
        # runs under random ids, and trailing chunks from a previously longer
        # PDF, are not removed by this change.
        chunk_ids = [f"{file_path}#chunk-{index}" for index in range(len(splits))]

        # Create vectorstore
        vectorstore = Chroma.from_documents(
            documents=splits,
            embedding=embeddings,
            ids=chunk_ids,
            persist_directory="./greenstep_education.db"
        )

        logger.info("Vectorstore created successfully")
        return vectorstore

    except FileNotFoundError as e:
        logger.error(f"File not found: {str(e)}")
        raise RuntimeError(f"PDF file not found: {str(e)}") from e
    except ValueError as e:
        logger.error(f"Invalid PDF content: {str(e)}")
        raise RuntimeError(f"PDF processing failed: {str(e)}") from e
    except Exception as e:
        logger.error(f"Unexpected error processing PDF: {str(e)}", exc_info=True)
        raise RuntimeError(f"PDF processing failed: {str(e)}") from e


def get_session_history(session_id: str) -> MongoDBChatMessageHistory:
    """Build the MongoDB-backed chat history object for one session.

    Args:
        session_id: Identifier of the conversation whose messages are wanted.

    Returns:
        A MongoDBChatMessageHistory bound to the configured connection
        string, database and collection, created with ``create_index=True``.
    """
    history_options = {
        "connection_string": MONGODB_URL,
        "session_id": session_id,
        "database_name": MONGODB_DATABASE,
        "collection_name": MONGODB_COLLECTION,
        "create_index": True,
    }
    return MongoDBChatMessageHistory(**history_options)


# Initialize vectorstore
# Runs once at import time: the PDF at PDF_PATH is indexed and a retriever
# returning the 4 best-matching chunks per query (k=4) is exposed at module
# level for the /ask handler. Any failure aborts startup.
try:
    logger.info(f"Initializing vectorstore from PDF: {PDF_PATH}")
    vectorstore = process_pdf(PDF_PATH)
    retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
    logger.info("Vectorstore initialized successfully")
except Exception as e:
    # Log the failure followed by operator-facing troubleshooting hints, then
    # re-raise as RuntimeError so the module does not load without a
    # retriever.
    logger.error(f"Vectorstore initialization failed: {str(e)}")
    logger.error("\nTroubleshooting steps:")
    logger.error("1. Verify PDF file exists at the specified path")
    logger.error("2. Ensure PDF contains extractable text (not just scanned images)")
    logger.error("3. Check if PDF is password-protected")
    logger.error("4. Try opening the PDF manually to verify it's not corrupted")
    logger.error("\nInstall additional libraries for better PDF support:")
    logger.error("  pip install pypdf pymupdf")
    raise RuntimeError(f"Vectorstore initialization failed: {str(e)}")


# Request body of POST /ask.
class QuestionRequest(BaseModel):
    # Identifies the conversation whose MongoDB history is read and extended.
    session_id: str
    # The user's question, passed to the RAG chain as its input.
    question: str


# Response body of POST /ask.
class QuestionResponse(BaseModel):
    # Model answer after <think>...</think> blocks have been stripped.
    answer: str


# Request body of POST /history.
class SessionHistoryRequest(BaseModel):
    # Identifies the conversation whose stored messages are returned.
    session_id: str


# Response body of POST /history.
class SessionHistoryResponse(BaseModel):
    # Echo of the requested session id.
    session_id: str
    # Number of stored messages for the session (length of `messages`).
    message_count: int
    # One {"type": ..., "content": ...} dict per stored message.
    messages: List[dict]


def _answer_with_history(session_id: str, question: str) -> str:
    """Run the blocking RAG pipeline for one question and persist the turn.

    Loads the session's chat history from MongoDB, builds the history-aware
    retrieval chain, invokes it with the last six stored messages as context,
    strips ``<think>...</think>`` blocks from the model output and appends the
    question/answer pair to the history.

    Args:
        session_id: Conversation whose history is read and extended.
        question: The user's question.

    Returns:
        The cleaned answer text.
    """
    # Get MongoDB chat history
    history = get_session_history(session_id)
    # A negative slice already returns the whole list when it holds fewer
    # than six messages, so no length check is needed.
    last_messages = history.messages[-6:]

    # Contextualize question
    contextualize_q_prompt = ChatPromptTemplate.from_messages([
        ("system", "Rephrase the user's question considering the chat history to provide better context."),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}")
    ])

    history_aware_retriever = create_history_aware_retriever(
        llm, retriever, contextualize_q_prompt
    )

    # System prompt for GreenStep
    system_prompt = """You are the GreenStep Education Assistant, a friendly, knowledgeable, and inspiring chatbot 
                            designed to educate users about reforestation, tree planting, forest conservation, and environmental 
                            sustainability within the GreenStep app's Education tab.

                            Your primary mission is to empower users with accurate, actionable knowledge about forests and their 
                            role in combating climate change, while fostering a deep appreciation for nature and encouraging 
                            environmental action.

                            Use the following verified educational content to answer questions:

                            {context}

                            Your responses should be:
                            1. Educational and engaging about reforestation, tree species, planting techniques, and environmental benefits
                            2. Scientifically accurate based on the provided educational content
                            3. Inspiring and action-oriented, motivating users to participate in reforestation
                            4. Accessible to diverse audiences with clear, jargon-free language
                            5. Balanced and honest about both opportunities and challenges
                            6. Interactive and conversational, building on previous discussions
                            7. Positive and solutions-focused, emphasizing hope and agency

                            Remember: Transform users from passive learners into informed environmental advocates who 
                            understand reforestation science and feel empowered to contribute through GreenStep.
                        """

    qa_prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}")
    ])

    question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
    rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

    # Get response
    result = rag_chain.invoke({
        "input": question,
        "chat_history": last_messages
    })
    raw_answer = result["answer"]

    # Remove <think>...</think> blocks
    cleaned_answer = re.sub(r"<think>.*?</think>\s*", "", raw_answer, flags=re.DOTALL).strip()

    # Update history only after a successful answer so failed requests do not
    # leave an unanswered question in the stored conversation.
    history.add_user_message(question)
    history.add_ai_message(cleaned_answer)
    return cleaned_answer


@app.post(
    "/ask",
    response_model=QuestionResponse,
    summary="Ask the GreenStep education assistant",
    description="Submit a question to learn about reforestation, trees, forests, and environmental conservation."
)
async def ask_question(request: QuestionRequest):
    """Handle question and maintain chat history in MongoDB"""
    # Imported locally so this handler does not depend on edits to the
    # file-level import block.
    from fastapi.concurrency import run_in_threadpool

    session_id = request.session_id
    question = request.question
    logger.info(f"Received question for session {session_id}: {question}")

    try:
        # The MongoDB reads/writes and the LLM/retriever calls are all
        # synchronous. Running them directly inside this coroutine would
        # block the event loop (and every other request) for the full
        # duration of the model call, so they run in the worker thread pool.
        cleaned_answer = await run_in_threadpool(
            _answer_with_history, session_id, question
        )

        logger.info(f"Response saved to MongoDB for session {session_id}")
        return QuestionResponse(answer=cleaned_answer)

    except Exception as e:
        logger.error(f"Error processing question: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}") from e


@app.post("/history", response_model=SessionHistoryResponse)
async def get_history(request: SessionHistoryRequest):
    """Retrieve chat history for a session"""
    # Returns every stored message of the session as {"type", "content"}
    # dicts together with their count; any failure becomes an HTTP 500.
    try:
        stored_messages = get_session_history(request.session_id).messages

        serialized = []
        for entry in stored_messages:
            serialized.append({"type": entry.type, "content": entry.content})

        return SessionHistoryResponse(
            session_id=request.session_id,
            message_count=len(stored_messages),
            messages=serialized,
        )
    except Exception as error:
        logger.error(f"Error retrieving history: {str(error)}")
        raise HTTPException(status_code=500, detail=f"Failed to retrieve history: {str(error)}")


@app.delete("/history/{session_id}")
async def clear_history(session_id: str):
    """Clear chat history for a session"""
    # Deletes all stored messages of the session and confirms with a message
    # payload; any failure becomes an HTTP 500.
    try:
        get_session_history(session_id).clear()
        logger.info(f"Cleared history for session {session_id}")
        confirmation = {"message": f"History cleared for session {session_id}"}
        return confirmation
    except Exception as error:
        logger.error(f"Error clearing history: {str(error)}")
        raise HTTPException(status_code=500, detail=f"Failed to clear history: {str(error)}")


@app.get("/health")
async def health_check():
    """Health check endpoint"""
    # Imported locally so this handler does not depend on edits to the
    # file-level import block.
    from fastapi.concurrency import run_in_threadpool

    try:
        # The ping is a blocking network call; when MongoDB is unreachable it
        # can wait for the driver's server-selection timeout, so it runs in
        # the worker thread pool instead of stalling the event loop.
        await run_in_threadpool(mongo_client.admin.command, 'ping')
        mongo_status = "connected"
    except Exception as e:
        mongo_status = f"disconnected: {str(e)}"

    return {
        # Previously this was always "healthy", even when the ping above
        # failed; report a degraded state when MongoDB is unreachable.
        "status": "healthy" if mongo_status == "connected" else "degraded",
        "app": "GreenStep Education Assistant",
        "mongodb": mongo_status,
        "vectorstore": "initialized" if vectorstore else "not initialized",
        "pdf_libraries": {
            "pypdf": PYPDF_AVAILABLE,
            "pymupdf": PYMUPDF_AVAILABLE
        }
    }


@app.get("/")
async def root():
    # Landing payload: a welcome message plus a map of the API's routes.
    endpoint_index = {
        "ask_question": "/ask",
        "get_history": "/history",
        "clear_history": "/history/{session_id}",
        "health_check": "/health",
        "documentation": "/docs",
    }
    welcome = {
        "message": "Welcome to GreenStep Education API",
        "description": "Learn about reforestation, tree planting, and environmental conservation",
    }
    welcome["endpoints"] = endpoint_index
    return welcome


@app.on_event("shutdown")
async def shutdown_event():
    """Close MongoDB connection"""
    # Registered as the application's "shutdown" handler; closes the shared
    # module-level client created at import time.
    # NOTE(review): newer FastAPI releases favour lifespan handlers over
    # on_event -- confirm against the pinned FastAPI version before migrating.
    mongo_client.close()
    logger.info("MongoDB connection closed")


# Script entry point: when the file is executed directly, serve the app with
# uvicorn on the HOST/PORT read from the environment above.
if __name__ == "__main__":
    # Imported here so uvicorn is only required when running as a script.
    import uvicorn
    uvicorn.run(app, host=HOST, port=PORT)