import gradio as gr
import pdfplumber
import docx
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
from PIL import Image
import pytesseract
import torch
import os
import spaces

# 🔐 Authenticate Hugging Face token
login(token=os.environ.get("token"))

# ✅ Ensure GPU is available
if not torch.cuda.is_available():
    raise RuntimeError("❌ GPU not detected! Please enable GPU in Space settings.")
print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")

# 🧠 Model
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# 📄 Document extractors
def extract_text_from_pdf(file):
    text = ""
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
            else:
                img = page.to_image(resolution=300).original
                ocr_text = pytesseract.image_to_string(img)
                text += ocr_text + "\n"
    return text


def extract_text_from_docx(file):
    doc = docx.Document(file)
    return "\n".join([para.text for para in doc.paragraphs if para.text.strip() != ""])


def chunk_text(text, max_chars=6000):
    paragraphs = text.split("\n")
    chunks, current_chunk = [], ""
    for para in paragraphs:
        if len(current_chunk) + len(para) < max_chars:
            current_chunk += para + "\n"
        else:
            chunks.append(current_chunk)
            current_chunk = para + "\n"
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


# 🧾 Q&A Prompt Template
def create_prompt(text_chunk):
    return f"""
You are an expert in analyzing U.S. government tender documents.
Based only on the content provided below, answer the following 20 standard questions in Q&A format.
If something is not mentioned, write "Not mentioned in the provided document."

CONTENT:
{text_chunk}

Now provide answers for:
Q1: What is the general scope of the tender?
Q2: Are certifications like SAM, CMMI, ISO, SBA, 8(a), or GSA required?
Q3: Is there a Set-aside status (e.g., 8a, SDVOSB)?
Q4: Are U.S. citizens or security-cleared staff required?
Q5: What is the expected team size or key qualifications?
Q6: Are offshore resources allowed?
Q7: What is the mode of working (On-site/Remote/Hybrid)?
Q8: Is presence in specific regions/states required?
Q9: Is the delivery location defined?
Q10: Is remote or offshore delivery allowed?
Q11: Is a U.S. office presence required?
Q12: Are travel/lodging expenses reimbursable?
Q13: Are cybersecurity frameworks (FedRAMP, NIST, HIPAA) required?
Q14: Are background checks or security clearance needed?
Q15: Is past experience required?
Q16: How many references are required?
Q17: Are only U.S. references accepted?
Q18: Is private sector experience allowed?
Q19: Do references need to be identified?
Q20: Is subcontracting permitted?

Answer clearly and in the same format:
Q1: ...
A1: ...
Q2: ...
A2: ...
...
"""


# 🧼 Cleaner
def clean_output(raw_output):
    lines = raw_output.splitlines()
    cleaned_lines = []
    started = False
    for line in lines:
        if line.strip().startswith("Q1:"):
            started = True
        if started:
            cleaned_lines.append(line)
    stop_idx = len(cleaned_lines)
    for i, line in enumerate(cleaned_lines[5:], 5):
        if "CONTENT:" in line or "You are an expert" in line:
            stop_idx = i
            break
    return "\n".join(cleaned_lines[:stop_idx]).strip()


# 🚀 Main analysis function
@spaces.GPU(duration=60)
def analyze_document(file, cancel_flag):
    ext = os.path.splitext(file.name)[-1].lower()
    if ext == ".pdf":
        raw_text = extract_text_from_pdf(file)
    elif ext == ".docx":
        raw_text = extract_text_from_docx(file)
    else:
        return "❌ Unsupported file format. Please upload a PDF or DOCX.", "❌ Invalid format"
    if len(raw_text.strip()) == 0:
        return "❌ No text found in the document.", "❌ Empty document"

    chunks = chunk_text(raw_text)
    full_summary = ""

    tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ.get("token"))
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.float16,
        token=os.environ.get("token"),
        trust_remote_code=True
    )
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

    for i, chunk in enumerate(chunks):
        if cancel_flag:
            return "⛔ Analysis cancelled by user.", "⛔ Terminated by user"
        # Progress is logged to the console; the status box updates when the run finishes
        print(f"🔄 Processing chunk {i+1} of {len(chunks)}...")
        prompt = create_prompt(chunk)
        result = generator(prompt, max_new_tokens=1024, do_sample=False)[0]["generated_text"]
        cleaned = clean_output(result)
        full_summary += cleaned + "\n\n---\n\n"

    return full_summary.strip(), "✅ Completed"


# 🌐 Gradio Interface
with gr.Blocks(title="Smart Tender Analyzer - US Edition") as demo:
    gr.Markdown("## 📄 Document Analyzer – Extract key tender information using a Transformer model (GPU-accelerated)")

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(label="📎 Upload Tender Document (PDF/DOCX)")
            with gr.Row():
                analyze_button = gr.Button("🔍 Analyze", variant="primary")
                terminate_button = gr.Button("❌ Terminate", variant="stop")
            status_box = gr.Textbox(label="📊 Status", value="⏳ Waiting for input...", interactive=False)
        with gr.Column(scale=2):
            output_box = gr.Textbox(label="🧠 Extracted Tender Key Information", lines=30, interactive=False)

    cancel_flag = gr.State(False)

    analyze_event = analyze_button.click(
        fn=analyze_document,
        inputs=[file_input, cancel_flag],
        outputs=[output_box, status_box]
    )

    # Set the cancel flag and abort the in-flight analysis job
    terminate_button.click(
        fn=lambda: True,
        inputs=[],
        outputs=[cancel_flag],
        cancels=[analyze_event]
    )

demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
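
# Deployment note (assumption): when run as a Hugging Face Space, the imports above imply a
# requirements.txt with gradio, pdfplumber, python-docx, transformers, accelerate,
# huggingface_hub, pillow, pytesseract, torch and spaces, plus the tesseract-ocr system
# package (e.g. via packages.txt) for the OCR fallback in extract_text_from_pdf.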