import os
import shutil
import sys
import logging
from pathlib import Path
from typing import List

import gradio as gr
from fastapi import FastAPI, HTTPException, UploadFile, File, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from dotenv import load_dotenv

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Initialize FastAPI
app = FastAPI(
    title="ParseAI API",
    description="API for processing and analyzing PDF documents",
    version="1.0.0"
)

# CORS middleware configuration
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, replace with specific origins
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Directory configuration
BASE_DIR = Path("/home/user/app/data")
UPLOAD_DIR = BASE_DIR / "uploads"
PROCESSED_DIR = BASE_DIR / "processed"
NLTK_DATA_DIR = Path(os.getenv("NLTK_DATA", "/app/nltk_data"))

# Ensure directories exist with proper permissions
for directory in [BASE_DIR, UPLOAD_DIR, PROCESSED_DIR]:
    try:
        directory.mkdir(parents=True, exist_ok=True)
        # Set permissions to 0o777 (read/write/execute for all)
        directory.chmod(0o777)
        logger.info(f"Created directory: {directory}")
    except Exception as e:
        logger.error(f"Failed to create directory {directory}: {e}")
        # If the directory is missing entirely, give up; if it exists but the
        # permissive chmod failed, fall back to 0o755 and continue.
        if not directory.exists():
            raise
        os.chmod(directory, 0o755)
        logger.info(f"Ensured directory exists: {directory}")

# Import modules after the environment is set up
try:
    from extractor import pdf_extractor
    from summarizer import document_summarizer
    from vector_store import vector_store

    # Initialize NLTK data
    import nltk
    nltk.data.path.append(str(NLTK_DATA_DIR))

    # Verify NLTK data is available
    try:
        nltk.data.find('tokenizers/punkt')
        nltk.data.find('corpora/stopwords')
        nltk.data.find('corpora/wordnet')
        nltk.data.find('taggers/averaged_perceptron_tagger')
        logger.info("NLTK data verified successfully")
    except LookupError as e:
        logger.warning(f"NLTK data missing: {e}")
        # Attempt to download the missing data
        try:
            nltk.download('punkt', download_dir=str(NLTK_DATA_DIR))
            nltk.download('stopwords', download_dir=str(NLTK_DATA_DIR))
            nltk.download('wordnet', download_dir=str(NLTK_DATA_DIR))
            nltk.download('averaged_perceptron_tagger', download_dir=str(NLTK_DATA_DIR))
            logger.info("Successfully downloaded NLTK data")
        except Exception as download_error:
            logger.error(f"Failed to download NLTK data: {download_error}")
            raise
except ImportError as e:
    logger.error(f"Failed to import required modules: {e}")
    raise

# Health check endpoint
@app.get("/health")
async def health_check():
    """Health check endpoint for monitoring."""
    return {
        "status": "healthy",
        "environment": os.getenv("ENV", "development"),
        "nltk_data": str(NLTK_DATA_DIR),
        "upload_dir": str(UPLOAD_DIR),
        "processed_dir": str(PROCESSED_DIR)
    }
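# --- Assumed extractor contract ----------------------------------------------
# The extractor, summarizer, and vector_store modules are not shown in this
# file. The TypedDicts below are a sketch of the return shape that
# process_document() (defined next) expects from pdf_extractor.extract_text();
# the field names are inferred from how the result is used below, not from the
# extractor's actual definition.
from typing import TypedDict

class PageText(TypedDict):
    text: str  # raw text of one page

class ExtractedData(TypedDict, total=False):
    # filename and total_pages are accessed directly, so they must be present;
    # timestamp is read with .get(), so it may legitimately be absent.
    filename: str
    total_pages: int
    timestamp: str
    text_by_page: List[PageText]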
async def process_document(file_path: str):
    """
    Process a document by extracting its text, summarizing it, and adding it
    to the vector store.

    Args:
        file_path (str): Path to the file to process.

    Returns:
        dict: Processing results including status, processed file path, and summary.
    """
    try:
        logger.info(f"Processing document: {file_path}")

        # Extract text from the PDF
        extracted_data = pdf_extractor.extract_text(file_path)
        logger.info(f"Extracted text from {len(extracted_data['text_by_page'])} pages")

        # Join the per-page text into a single string
        full_text = " ".join(page["text"] for page in extracted_data["text_by_page"])

        # Summarize the text
        summary_result = document_summarizer.summarize_text(full_text)
        logger.info("Document summarization completed")

        # Add the document to the vector store
        metadata = {
            "filename": extracted_data["filename"],
            "total_pages": extracted_data["total_pages"],
            "summary": summary_result.get("full_summary", ""),
            "timestamp": extracted_data.get("timestamp", "")
        }
        vector_store.add_document(full_text, metadata)
        logger.info("Document added to vector store")

        # Save the processed data
        processed_path = pdf_extractor.save_extracted_text(
            {
                **extracted_data,
                "summary": summary_result.get("full_summary", ""),
                "chunk_summaries": summary_result.get("chunk_summaries", [])
            },
            str(PROCESSED_DIR)
        )
        logger.info(f"Processed data saved to {processed_path}")

        return {
            "status": "success",
            "processed_file": processed_path,
            "summary": summary_result.get("full_summary", "")
        }
    except Exception as e:
        error_msg = f"Error processing document: {e}"
        logger.error(error_msg, exc_info=True)
        raise RuntimeError(error_msg) from e


@app.post("/upload/pdf")
async def upload_pdf(
    file: UploadFile = File(...),
    background_tasks: BackgroundTasks = None
):
    """Upload a PDF file for background processing."""
    if not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files can be uploaded")

    # Use the basename only, guarding against path traversal in the filename
    file_path = UPLOAD_DIR / Path(file.filename).name
    try:
        # Save the uploaded file to disk
        with open(file_path, "wb") as buffer:
            content = await file.read()
            buffer.write(content)

        # Kick off document processing in the background
        background_tasks.add_task(process_document, str(file_path))

        return {"filename": file.filename, "status": "processing"}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/search")
async def search_documents(query: str, top_k: int = 5):
    """Search indexed documents."""
    try:
        results = vector_store.search(query, top_k)
        return {"results": results}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


# Build the Gradio interface
async def process_file(file):
    """Gradio handler: copy the uploaded file into UPLOAD_DIR and process it."""
    # With type="filepath", Gradio passes the path of a temporary copy as a str
    file_path = UPLOAD_DIR / Path(file).name
    shutil.copy(file, file_path)
    # process_document is a coroutine; Gradio supports async handlers, so await it
    result = await process_document(str(file_path))
    return result["summary"]

def search(query):
    """Gradio handler: search the vector store and format the results."""
    # vector_store.search() returns a list of result dicts (see /search above)
    results = vector_store.search(query)
    return "\n\n".join(
        f"{r['filename']} - similarity: {r['similarity']:.2f}" for r in results
    )

with gr.Blocks() as demo:
    gr.Markdown("# ParseAI PDF Analysis Service")

    with gr.Tab("PDF Upload"):
        file_input = gr.File(type="filepath", file_types=[".pdf"])
        upload_button = gr.Button("Upload")
        summary_output = gr.Textbox(label="Summary")
        upload_button.click(
            process_file,
            inputs=[file_input],
            outputs=[summary_output]
        )

    with gr.Tab("Document Search"):
        search_input = gr.Textbox(label="Enter search query")
        search_button = gr.Button("Search")
        search_output = gr.Textbox(label="Search Results")
        search_button.click(
            search,
            inputs=[search_input],
            outputs=[search_output]
        )

if __name__ == "__main__":
    demo.launch()
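
# --- Serving both apps (a sketch) ---------------------------------------------
# demo.launch() above serves only the Gradio UI; the FastAPI routes on `app`
# (/health, /upload/pdf, /search) are not exposed through it. Gradio provides
# gr.mount_gradio_app() for attaching Blocks to an existing FastAPI app; the
# alternative entry point below is a sketch assuming that function is available
# in the installed Gradio version, with the UI served under /ui:
#
#     if __name__ == "__main__":
#         import uvicorn
#         app = gr.mount_gradio_app(app, demo, path="/ui")
#         uvicorn.run(app, host="0.0.0.0", port=7860)
#
# With that setup, the API endpoints stay reachable alongside the UI.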