import streamlit as st
import pandas as pd
from mistralai import Mistral
import os
import io
import zipfile

# --- Page Config ---
# Browser-tab title and wide layout for the whole app.
st.set_page_config(layout="wide", page_title="PDF Loader")

# Heading plus a short description of the two artifacts the tool produces.
st.title("📄 PDF Loader")
st.markdown("""
This tool allows you to upload multiple PDF files, process them using **Mistral's OCR**.
**Output:**
1. A ZIP file containing full text files for each PDF.
2. A CSV with every sentence having its metadata.
""")

# --- Sidebar: Configuration ---
# Sidebar elements are addressed through `st.sidebar` directly.
st.sidebar.header("Configuration")
# Password-type input; the value is read further down when the Mistral
# client is created.
api_key = st.sidebar.text_input("Enter Mistral API Key", type="password")
st.sidebar.caption("Your key is not stored and is only used for this session.")

st.sidebar.divider()
st.sidebar.info("Ensure you have access to `mistral-ocr-latest`.")

# --- Helper Function: Process Single PDF ---
def process_pdf(client, uploaded_file):
    """
    Run Mistral OCR on one uploaded PDF and return its text.

    The file is uploaded to Mistral, a signed URL is requested for it, the
    ``mistral-ocr-latest`` model is run against that URL, and the markdown of
    every returned page is concatenated. Whatever the outcome, the uploaded
    copy is removed from Mistral storage afterwards (best effort) so user
    documents do not accumulate remotely.

    Args:
        client: Object exposing ``files.upload``, ``files.get_signed_url``,
            ``files.delete`` and ``ocr.process`` (the Mistral client built by
            the caller).
        uploaded_file: Object with a ``name`` attribute and a ``getvalue()``
            method returning the PDF bytes.

    Returns:
        The page markdown joined into one string, each page followed by a
        single space; ``""`` when the response has no pages; ``None`` when
        any step raised (the error is shown with ``st.error``).
    """
    remote_file_id = None
    try:
        # 1. Upload the file to Mistral
        # using .getvalue() to satisfy Pydantic strict typing
        uploaded_mistral_file = client.files.upload(
            file={
                "file_name": uploaded_file.name,
                "content": uploaded_file.getvalue(),
            },
            purpose="ocr",
        )
        # Remember the id right away so the cleanup below runs even if a
        # later step fails.
        remote_file_id = uploaded_mistral_file.id

        # 2. Get a signed URL (temporary)
        signed_url = client.files.get_signed_url(file_id=remote_file_id)

        # 3. Process with Mistral OCR
        ocr_response = client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": "document_url",
                "document_url": signed_url.url,
            },
            include_image_base64=False,
        )

        # 4. Extract text from the response.
        # A missing or None `pages` attribute is treated as "no pages".
        pages = getattr(ocr_response, "pages", None) or []
        # Each page is followed by a space so sentences on adjacent pages
        # don't merge; a page without markdown contributes only the space.
        return "".join(f"{page.markdown or ''} " for page in pages)

    except Exception as e:
        # UI boundary: report the failure for this file and let the caller
        # continue with the remaining uploads.
        st.error(f"Error processing {uploaded_file.name}: {e}")
        return None

    finally:
        if remote_file_id is not None:
            _delete_remote_file(client, remote_file_id, uploaded_file.name)


def _delete_remote_file(client, file_id, display_name):
    """Best-effort removal of an uploaded file from Mistral storage."""
    try:
        client.files.delete(file_id=file_id)
    except Exception as e:
        # A failed cleanup must not discard text that was extracted
        # successfully, so it is reported as a warning only.
        st.warning(f"Could not remove {display_name} from Mistral storage: {e}")

# --- Main Interface ---

# Session-state slot holding the last run's results. Streamlit reruns the
# whole script on every widget interaction (including a click on a download
# button), and `st.button` is only True on the run triggered by its own
# click, so results must live outside the button branch to stay visible.
_RESULTS_KEY = "pdf_loader_results"


def _unique_txt_name(original_name, used_names):
    """Return a ``.txt`` archive name for *original_name* not yet in *used_names*.

    Names are compared case-insensitively so the archive also extracts
    cleanly on case-insensitive file systems. The chosen name is recorded in
    *used_names* (a set of lower-cased names) before returning.
    """
    stem = os.path.splitext(original_name)[0]
    candidate = f"{stem}.txt"
    counter = 1
    while candidate.lower() in used_names:
        candidate = f"{stem}_{counter}.txt"
        counter += 1
    used_names.add(candidate.lower())
    return candidate


uploaded_files = st.file_uploader(
    "Upload PDF files",
    type=["pdf"],
    accept_multiple_files=True,
)

if st.button("Start PDF Processing", type="primary"):
    if not api_key:
        st.error("Please provide your Mistral API Key in the sidebar.")
    elif not uploaded_files:
        st.warning("Please upload at least one PDF file.")
    else:
        # A new run replaces whatever the previous run produced.
        if _RESULTS_KEY in st.session_state:
            del st.session_state[_RESULTS_KEY]

        # Initialize Mistral Client
        client = Mistral(api_key=api_key)

        # Containers for results
        processed_data = []  # One dict per sentence, for the CSV
        zip_buffer = io.BytesIO()  # In-memory ZIP of per-PDF TXT files
        used_txt_names = set()  # Lower-cased member names already in the ZIP
        total_files = len(uploaded_files)
        succeeded = 0  # Files that actually yielded text

        # Progress Bar
        progress_bar = st.progress(0)
        status_text = st.empty()

        with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
            for position, file in enumerate(uploaded_files, start=1):
                status_text.text(
                    f"Processing file {position} of {total_files}: {file.name}..."
                )

                # Perform OCR (None means the error was already reported).
                extracted_text = process_pdf(client, file)

                if extracted_text:
                    succeeded += 1

                    # CSV: one row per full-stop-delimited sentence, with
                    # surrounding whitespace removed and empty pieces dropped.
                    # NOTE: a plain split on "." also breaks at decimals and
                    # abbreviations; kept as the established output format.
                    stripped_parts = (
                        part.strip() for part in extracted_text.split('.')
                    )
                    processed_data.extend(
                        {"text": sentence, "source_file": file.name}
                        for sentence in stripped_parts
                        if sentence
                    )

                    # ZIP: keep the full text; de-duplicate the member name
                    # so same-named uploads don't overwrite each other.
                    zf.writestr(
                        _unique_txt_name(file.name, used_txt_names),
                        extracted_text,
                    )
                elif extracted_text is not None:
                    # OCR ran without error but returned nothing to keep.
                    st.warning(f"No text could be extracted from {file.name}.")

                # Update progress
                progress_bar.progress(position / total_files)

        status_text.text("Processing Complete!")

        if processed_data:
            df = pd.DataFrame(processed_data)
            # Persist everything the results section needs so it survives
            # the rerun caused by clicking a download button.
            st.session_state[_RESULTS_KEY] = {
                "succeeded": succeeded,
                "total": total_files,
                "dataframe": df,
                "csv": df.to_csv(index=False).encode('utf-8'),
                "zip": zip_buffer.getvalue(),
            }
        else:
            st.warning("No sentences were extracted from the uploaded files.")

# --- Display Results & Downloads ---
# Rendered on every run while results exist, not only on the run in which
# the "Start" button was clicked.
if _RESULTS_KEY in st.session_state:
    results = st.session_state[_RESULTS_KEY]

    st.success(
        f"Successfully processed {results['succeeded']} of {results['total']} files."
    )

    # Layout for download buttons
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("1. Download CSV")
        st.download_button(
            label="Download CSV (Sentences)",
            data=results["csv"],
            file_name="pdf_sentences.csv",
            mime="text/csv",
        )

    with col2:
        st.subheader("2. Download Full Text Files")
        st.download_button(
            label="Download All .txt Files (ZIP)",
            data=results["zip"],
            file_name="individual_pdf_files.zip",
            mime="application/zip",
        )

    # Preview Data
    with st.expander("Preview Extracted Sentences"):
        st.dataframe(results["dataframe"])