Spaces:
Build error
Build error
| import os | |
| import sys | |
| import logging | |
| from pathlib import Path | |
| from typing import List, Dict, Optional | |
| import gradio as gr | |
| from fastapi import FastAPI, HTTPException, status, UploadFile, File, BackgroundTasks | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from dotenv import load_dotenv | |
# Configure logging: INFO and above, timestamped records, written to stdout.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
_stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    level=logging.INFO,
    format=_LOG_FORMAT,
    handlers=[_stdout_handler],
)

# Module-level logger used by the rest of this file.
logger = logging.getLogger(__name__)
# Load environment variables (from a .env file, if present) before reading any
# configuration below.
load_dotenv()

# Initialize FastAPI
app = FastAPI(
    title="ParseAI API",
    description="API for processing and analyzing PDF documents",
    version="1.0.0"
)

# CORS middleware configuration.
# Allowed origins are read from the comma-separated CORS_ALLOW_ORIGINS
# environment variable; when it is unset or empty, any origin ("*") is allowed.
_cors_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    # Credentialed requests (cookies, Authorization headers) are only enabled
    # for an explicit origin list: combining them with a wildcard origin is
    # not permitted by the FastAPI CORS guidance.
    allow_credentials="*" not in _cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Directory configuration
BASE_DIR = Path("/home/user/app/data")
UPLOAD_DIR = BASE_DIR / "uploads"
PROCESSED_DIR = BASE_DIR / "processed"

# Use system NLTK data directory that we'll populate in the Dockerfile
NLTK_DATA_DIR = Path("/usr/local/share/nltk_data")

# Ensure the data directories exist and end up with 0o755 permissions.
for directory in (BASE_DIR, UPLOAD_DIR, PROCESSED_DIR):
    try:
        directory.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        logger.error(f"Failed to create directory {directory}: {str(e)}")
        # Only give up when the directory is really unavailable; a failure on
        # an already-existing directory is tolerated.
        if not directory.exists():
            raise

    try:
        # Owner read/write/execute, everyone else read/execute. The directory
        # is never made world-writable, not even temporarily.
        directory.chmod(0o755)
    except OSError as e:
        # Best effort: the process may not own a pre-existing directory, and
        # that must not prevent the application from starting.
        logger.warning(f"Could not set permissions on {directory}: {str(e)}")

    logger.info(f"Ensured directory exists: {directory}")
# Import project modules only after the environment (logging, directories)
# has been set up, then make sure the NLTK resources are reachable.
try:
    from extractor import pdf_extractor
    from summarizer import document_summarizer
    from vector_store import vector_store

    # Initialize NLTK data
    import nltk

    # Candidate NLTK data locations, system paths first, then user paths.
    # Each location is listed exactly once; the order is the lookup priority.
    nltk_data_paths = [
        str(NLTK_DATA_DIR),
        '/usr/share/nltk_data',
        '/usr/local/nltk_data',
        '/usr/local/lib/nltk_data',
        '/usr/lib/nltk_data',
        '/root/nltk_data',
        '/home/user/nltk_data',
    ]

    # Put our candidates in front of NLTK's defaults; dict.fromkeys drops any
    # duplicates while preserving order.
    nltk.data.path = list(dict.fromkeys(nltk_data_paths + nltk.data.path))

    # Verify NLTK data is available
    required_nltk_data = [
        'tokenizers/punkt',
        'corpora/stopwords',
        'corpora/wordnet',
        'taggers/averaged_perceptron_tagger',
    ]

    for resource in required_nltk_data:
        try:
            nltk.data.find(resource)
            logger.info(f"NLTK resource found: {resource}")
        except LookupError:
            logger.warning(f"NLTK resource not found: {resource}")
            # Try to download the resource if not found. A failure here is
            # logged but deliberately does not stop start-up.
            try:
                # The downloader expects the package id, i.e. the last path
                # component without any extension.
                resource_name = resource.split('/')[-1].split('.')[0]
                logger.info(f"Attempting to download NLTK resource: {resource_name}")
                # NLTK_DATA_DIR is already the first entry of nltk.data.path,
                # so the download is found without touching the path again.
                nltk.download(resource_name, download_dir=str(NLTK_DATA_DIR))
                nltk.data.find(resource)  # Try to find it again after download
                logger.info(f"Successfully downloaded NLTK resource: {resource}")
            except Exception as download_error:
                logger.error(f"Failed to download NLTK resource {resource}: {str(download_error)}")
except ImportError as e:
    logger.error(f"Failed to import required modules: {e}")
    raise
# Health check endpoint
# NOTE(review): no route decorator is visible on this function in this view;
# confirm that it is registered on `app` (it may have been lost upstream).
async def health_check():
    """Return a status payload with the environment name and data paths.

    Returns:
        dict: "status" (always "healthy"), "environment" (the ENV variable,
        defaulting to "development"), and the NLTK, upload and processed
        directories as strings.
    """
    environment_name = os.getenv("ENV", "development")
    return dict(
        status="healthy",
        environment=environment_name,
        nltk_data=str(NLTK_DATA_DIR),
        upload_dir=str(UPLOAD_DIR),
        processed_dir=str(PROCESSED_DIR),
    )
def process_document(file_path: str) -> dict:
    """
    Process a document by extracting text, summarizing it, and adding to the vector store.

    Args:
        file_path (str): Path to the file to process

    Returns:
        dict: Processing results with the keys "status", "processed_file"
        (whatever pdf_extractor.save_extracted_text returns) and "summary".

    Raises:
        Exception: If any step fails. The message is prefixed with
        "Error processing document: " and the original exception is attached
        as the cause.
    """
    try:
        logger.info(f"Processing document: {file_path}")

        # Extract the PDF text
        extracted_data = pdf_extractor.extract_text(file_path)
        logger.info(f"Extracted text from {len(extracted_data['text_by_page'])} pages")

        # Join the text of all pages into one string
        full_text = " ".join(page["text"] for page in extracted_data["text_by_page"])

        # Summarize the text
        summary_result = document_summarizer.summarize_text(full_text)
        logger.info("Document summarization completed")

        # Looked up once; reused for the metadata, the saved file and the result.
        full_summary = summary_result.get("full_summary", "")

        # Add the document to the vector store
        metadata = {
            "filename": extracted_data["filename"],
            "total_pages": extracted_data["total_pages"],
            "summary": full_summary,
            "timestamp": extracted_data.get("timestamp", "")
        }
        vector_store.add_document(full_text, metadata)
        logger.info("Document added to vector store")

        # Save the processed data
        processed_path = pdf_extractor.save_extracted_text(
            {
                **extracted_data,
                "summary": full_summary,
                "chunk_summaries": summary_result.get("chunk_summaries", [])
            },
            str(PROCESSED_DIR)
        )
        logger.info(f"Processed data saved to {processed_path}")

        return {
            "status": "success",
            "processed_file": processed_path,
            "summary": full_summary
        }
    except Exception as e:
        error_msg = f"Error processing document: {str(e)}"
        logger.error(error_msg, exc_info=True)
        # Chain the original exception so its traceback is not lost.
        raise Exception(error_msg) from e
# NOTE(review): no route decorator is visible on this function in this view;
# confirm that it is registered on `app`.
async def upload_pdf(
    file: UploadFile = File(...),
    background_tasks: BackgroundTasks = None
):
    """PDF file upload API.

    Stores the uploaded PDF in UPLOAD_DIR and schedules process_document for
    it as a background task.

    Args:
        file: The uploaded file; its name must end in ".pdf" (case-insensitive).
        background_tasks: Task queue on which the processing is scheduled.

    Returns:
        dict: {"filename": <stored file name>, "status": "processing"}.

    Raises:
        HTTPException: 400 when the upload has no name or is not a ".pdf"
            file; 500 when saving or scheduling fails.
    """
    original_name = file.filename or ""
    # The client controls the file name: keep only its last path component so
    # that names such as "../../x.pdf" cannot escape UPLOAD_DIR.
    safe_name = Path(original_name.replace("\\", "/")).name

    if not safe_name.lower().endswith('.pdf'):
        # Message: "Only PDF files can be uploaded"
        raise HTTPException(status_code=400, detail="PDF 파일만 업로드 가능합니다")

    file_path = UPLOAD_DIR / safe_name

    try:
        # Save the file
        content = await file.read()
        with open(file_path, "wb") as buffer:
            buffer.write(content)

        # Start processing the document asynchronously
        background_tasks.add_task(process_document, str(file_path))

        return {"filename": safe_name, "status": "processing"}
    except Exception as e:
        logger.error(f"Failed to store upload {safe_name}: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
# NOTE(review): no route decorator is visible on this function in this view;
# confirm that it is registered on `app`.
async def search_documents(query: str, top_k: int = 5):
    """Document search API.

    Args:
        query: Search text passed to the vector store.
        top_k: Second argument forwarded to vector_store.search (default 5);
            presumably the maximum number of hits - verify against
            vector_store.

    Returns:
        dict: {"results": <value returned by vector_store.search>}.

    Raises:
        HTTPException: 500 when the vector store lookup fails.
    """
    try:
        results = vector_store.search(query, top_k)
    except Exception as e:
        # Log the failure before it is turned into an HTTP error; otherwise
        # the traceback would be lost.
        logger.error(f"Error searching documents: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"results": results}
# Create the Gradio interface
def process_file(file_path):
    """Process the uploaded file and return the summary.

    Args:
        file_path: Path of the uploaded file; may be empty or None when
            nothing was uploaded.

    Returns:
        str: The document summary, or a user-facing (Korean) error message
        when the file is missing or processing fails.
    """
    # file_path is already a string path from Gradio's type="filepath"
    if not file_path or not os.path.exists(file_path):
        # Message: "File not found. Please try again."
        return "파일을 찾을 수 없습니다. 다시 시도해주세요."

    try:
        result = process_document(file_path)
        # Fallback message: "Could not generate a summary."
        return result.get("summary", "요약을 생성할 수 없습니다.")
    except Exception as e:
        logger.error(f"Error processing file: {str(e)}", exc_info=True)
        # Message: "An error occurred while processing the file: <error>"
        return f"파일 처리 중 오류가 발생했습니다: {str(e)}"
def search(query):
    """Search the vector store and format the hits for the Gradio text box.

    Args:
        query: Search text entered by the user.

    Returns:
        str: One "<filename> - 유사도: <similarity>" line per hit ("유사도"
        means "similarity"), separated by blank lines; an empty string when
        there are no hits.
    """
    results = vector_store.search(query)
    # search_documents() treats the return value of vector_store.search() as
    # the hit collection itself, so accept that shape as well as a
    # {"results": [...]} wrapper.
    hits = results.get("results", []) if isinstance(results, dict) else results
    return "\n\n".join(
        f"{r['filename']} - 유사도: {r['similarity']:.2f}" for r in (hits or [])
    )
# Gradio UI: one tab to upload and summarize a PDF, one tab to search the
# processed documents. The user-facing labels are Korean; English meanings
# are given in the comments.
with gr.Blocks() as demo:
    gr.Markdown("# ParseAI PDF 분석 서비스")  # "ParseAI PDF analysis service"

    with gr.Tab("PDF 업로드"):  # "PDF upload"
        file_input = gr.File(
            label="PDF 파일을 선택하세요",  # "Select a PDF file"
            file_types=[".pdf"],
            type="filepath"
        )
        upload_button = gr.Button("업로드")  # "Upload"
        summary_output = gr.Textbox(label="요약")  # "Summary"
        upload_button.click(
            process_file,
            inputs=[file_input],
            outputs=[summary_output]
        )

    with gr.Tab("문서 검색"):  # "Document search"
        search_input = gr.Textbox(label="검색어 입력")  # "Enter search term"
        search_button = gr.Button("검색")  # "Search"
        search_output = gr.Textbox(label="검색 결과")  # "Search results"
        search_button.click(
            search,
            inputs=[search_input],
            outputs=[search_output]
        )

# Launch only the Gradio UI when this file is run as a script.
if __name__ == "__main__":
    demo.launch()