Spaces: Build error
import os
import sys
import shutil
import asyncio
import logging
from pathlib import Path
from typing import List, Dict, Optional
import gradio as gr
from fastapi import FastAPI, HTTPException, status, UploadFile, File, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from dotenv import load_dotenv
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Initialize FastAPI
app = FastAPI(
    title="ParseAI API",
    description="API for processing and analyzing PDF documents",
    version="1.0.0"
)
# CORS middleware configuration
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, replace with specific origins
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Directory configuration
BASE_DIR = Path("/home/user/app/data")
UPLOAD_DIR = BASE_DIR / "uploads"
PROCESSED_DIR = BASE_DIR / "processed"
NLTK_DATA_DIR = Path(os.getenv("NLTK_DATA", "/app/nltk_data"))

# Ensure directories exist with proper permissions
for directory in [BASE_DIR, UPLOAD_DIR, PROCESSED_DIR]:
    try:
        directory.mkdir(parents=True, exist_ok=True)
        # Open up permissions (read/write/execute for all) so the runtime user can write
        directory.chmod(0o777)
        logger.info(f"Ensured directory exists: {directory}")
    except Exception as e:
        logger.error(f"Failed to create directory {directory}: {str(e)}")
        # Only abort if the directory really does not exist
        if not directory.exists():
            raise
# Import modules after environment is set up
try:
    from extractor import pdf_extractor
    from summarizer import document_summarizer
    from vector_store import vector_store

    # Initialize NLTK data
    import nltk
    nltk.data.path.append(str(NLTK_DATA_DIR))

    # Verify NLTK data is available
    try:
        nltk.data.find('tokenizers/punkt')
        nltk.data.find('corpora/stopwords')
        nltk.data.find('corpora/wordnet')
        nltk.data.find('taggers/averaged_perceptron_tagger')
        logger.info("NLTK data verified successfully")
    except LookupError as e:
        logger.warning(f"NLTK data missing: {e}")
        # Attempt to download missing data
        try:
            nltk.download('punkt', download_dir=str(NLTK_DATA_DIR))
            nltk.download('stopwords', download_dir=str(NLTK_DATA_DIR))
            nltk.download('wordnet', download_dir=str(NLTK_DATA_DIR))
            nltk.download('averaged_perceptron_tagger', download_dir=str(NLTK_DATA_DIR))
            logger.info("Successfully downloaded NLTK data")
        except Exception as download_error:
            logger.error(f"Failed to download NLTK data: {download_error}")
            raise
except ImportError as e:
    logger.error(f"Failed to import required modules: {e}")
    raise
# Health check endpoint
@app.get("/health")
async def health_check():
    """Health check endpoint for monitoring"""
    return {
        "status": "healthy",
        "environment": os.getenv("ENV", "development"),
        "nltk_data": str(NLTK_DATA_DIR),
        "upload_dir": str(UPLOAD_DIR),
        "processed_dir": str(PROCESSED_DIR)
    }
async def process_document(file_path: str):
    """
    Process a document by extracting text, summarizing it, and adding it to the vector store.

    Args:
        file_path (str): Path to the file to process

    Returns:
        dict: Processing results including status, processed file path, and summary
    """
    try:
        logger.info(f"Processing document: {file_path}")

        # Extract text from the PDF
        extracted_data = pdf_extractor.extract_text(file_path)
        logger.info(f"Extracted text from {len(extracted_data['text_by_page'])} pages")

        # Join the per-page text into the full document text
        full_text = " ".join([page["text"] for page in extracted_data["text_by_page"]])

        # Summarize the text
        summary_result = document_summarizer.summarize_text(full_text)
        logger.info("Document summarization completed")

        # Add the document to the vector store
        metadata = {
            "filename": extracted_data["filename"],
            "total_pages": extracted_data["total_pages"],
            "summary": summary_result.get("full_summary", ""),
            "timestamp": extracted_data.get("timestamp", "")
        }
        vector_store.add_document(full_text, metadata)
        logger.info("Document added to vector store")

        # Save the processed data
        processed_path = pdf_extractor.save_extracted_text(
            {
                **extracted_data,
                "summary": summary_result.get("full_summary", ""),
                "chunk_summaries": summary_result.get("chunk_summaries", [])
            },
            str(PROCESSED_DIR)
        )
        logger.info(f"Processed data saved to {processed_path}")

        return {
            "status": "success",
            "processed_file": processed_path,
            "summary": summary_result.get("full_summary", "")
        }
    except Exception as e:
        error_msg = f"Error processing document: {str(e)}"
        logger.error(error_msg, exc_info=True)
        raise Exception(error_msg)
# PDF upload endpoint
@app.post("/upload")
async def upload_pdf(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...)
):
    """Upload a PDF file and process it in the background."""
    if not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files can be uploaded")

    file_path = UPLOAD_DIR / file.filename
    try:
        # Save the uploaded file
        with open(file_path, "wb") as buffer:
            content = await file.read()
            buffer.write(content)

        # Process the document asynchronously in the background
        background_tasks.add_task(process_document, str(file_path))
        return {"filename": file.filename, "status": "processing"}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# Document search endpoint
@app.get("/search")
async def search_documents(query: str, top_k: int = 5):
    """Search indexed documents by similarity."""
    try:
        results = vector_store.search(query, top_k)
        return {"results": results}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# Create the Gradio interface
def process_file(file):
    """Gradio handler: copy the uploaded PDF into UPLOAD_DIR and process it."""
    if file is None:
        return "No file uploaded."
    # gr.File(type="filepath") hands the handler a temporary file path
    source_path = Path(file)
    file_path = UPLOAD_DIR / source_path.name
    shutil.copy(source_path, file_path)
    # process_document is a coroutine, so run it to completion here
    result = asyncio.run(process_document(str(file_path)))
    return result["summary"]

def search(query):
    """Gradio handler: run a similarity search and format the hits."""
    results = vector_store.search(query)
    return "\n\n".join(f"{r['filename']} - similarity: {r['similarity']:.2f}" for r in results)
with gr.Blocks() as demo:
    gr.Markdown("# ParseAI PDF Analysis Service")
    with gr.Tab("Upload PDF"):
        # Newer Gradio versions accept type="filepath"; the old type="file" was removed
        file_input = gr.File(type="filepath", file_types=[".pdf"])
        upload_button = gr.Button("Upload")
        summary_output = gr.Textbox(label="Summary")
        upload_button.click(
            process_file,
            inputs=[file_input],
            outputs=[summary_output]
        )
    with gr.Tab("Document Search"):
        search_input = gr.Textbox(label="Search query")
        search_button = gr.Button("Search")
        search_output = gr.Textbox(label="Search results")
        search_button.click(
            search,
            inputs=[search_input],
            outputs=[search_output]
        )
if __name__ == "__main__":
    demo.launch()
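
# Note: demo.launch() serves only the Gradio UI, so the FastAPI routes defined on
# `app` above are never exposed. A minimal sketch of serving both, assuming the
# Space runs this file directly and listens on the default port 7860:
#
#     import uvicorn
#
#     app = gr.mount_gradio_app(app, demo, path="/")
#
#     if __name__ == "__main__":
#         uvicorn.run(app, host="0.0.0.0", port=7860)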