# from fastapi import FastAPI, UploadFile, File
# # from transformers import AutoProcessor, AutoModelForVision2Seq,AutoModel
# from transformers import AutoProcessor,AutoModel
# from PIL import Image
# import torch
# import io

# app = FastAPI()

# MODEL_ID = "zai-org/GLM-OCR"

# print("Loading GLM-OCR model...")

# # processor = AutoProcessor.from_pretrained(MODEL_ID)
# # model = AutoModelForVision2Seq.from_pretrained(
# #     MODEL_ID,
# #     torch_dtype=torch.float32
# # )

# # processor = AutoProcessor.from_pretrained(
# #     MODEL_ID,
# #     trust_remote_code=True
# # )

# # model = AutoModelForVision2Seq.from_pretrained(
# #     MODEL_ID,
# #     trust_remote_code=True,
# #     torch_dtype=torch.float32
# # )

# processor = AutoProcessor.from_pretrained(
#     MODEL_ID,
#     trust_remote_code=True
# )

# model = AutoModel.from_pretrained(
#     MODEL_ID,
#     trust_remote_code=True
# )

# @app.get("/")
# async def root():
#     return {"status": "GLM-OCR API is running"}

# @app.post("/ocr")
# async def extract_text(file: UploadFile = File(...)):
#     try:
#         contents = await file.read()
#         image = Image.open(io.BytesIO(contents)).convert("RGB")

#         # inputs = processor(images=image, return_tensors="pt")
#         inputs = processor(
#         text="Extract all text from the document",
#         images=image,
#         return_tensors="pt"
# )

#         with torch.no_grad():
#             outputs = model.generate(**inputs, max_new_tokens=1024)

#         text = processor.batch_decode(outputs, skip_special_tokens=True)[0]

#         return {
#             "success": True,
#             "text": text
#         }

#     except Exception as e:
#         return {
#             "success": False,
#             "error": str(e)
#         }


import asyncio
import io
import logging

import torch
from fastapi import FastAPI, UploadFile, File
from PIL import Image
from transformers import AutoProcessor, GlmOcrForConditionalGeneration

# ASGI application object; the route decorators below register on it.
app = FastAPI()

# Hugging Face Hub repository the processor and weights are loaded from.
MODEL_ID = "zai-org/GLM-OCR"

print("Loading GLM-OCR model...")

# The processor and model are loaded once at import time and shared by every
# request handled by this process.
# NOTE(review): trust_remote_code=True allows code shipped in the Hub
# repository to be executed locally -- confirm it is still required now that
# the model class is imported directly from transformers.
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# float32 is the CPU-friendly choice; torch.bfloat16 is the usual alternative
# when a GPU is available.
model = GlmOcrForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float32,
)
# Switch to inference mode (eval() returns the same module object).
model.eval()

@app.get("/")
async def root():
    """Liveness endpoint: report that the API process is up and serving."""
    status_payload = {"status": "GLM-OCR API is running"}
    return status_payload

def _run_ocr(contents):
    """Decode uploaded image bytes and run GLM-OCR on them (blocking).

    This is plain synchronous code on purpose: it is meant to be executed in
    a worker thread so that image decoding and generation never run on the
    asyncio event loop.

    Args:
        contents: Raw bytes of the uploaded image file.

    Returns:
        The text generated by the model for the image, with surrounding
        whitespace stripped.

    Raises:
        Exception: Anything raised by PIL while decoding the image or by the
            processor/model during preprocessing and generation.
    """
    image = Image.open(io.BytesIO(contents)).convert("RGB")

    # Single user turn carrying both the image and the instruction.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "Extract all text from this image."},
            ],
        }
    ]

    # The chat template tokenizes the prompt and preprocesses the image in
    # one call, so the result contains input_ids alongside the image tensors.
    # Moving the batch to the model's device keeps this working when the
    # model is placed on a GPU (it is a no-op on CPU).
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # Greedy decoding, no gradient tracking needed for inference.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            do_sample=False,
        )

    # generate() returns prompt + completion; drop the prompt tokens so only
    # the model's answer is decoded.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = outputs[:, prompt_length:]
    text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return text.strip()


@app.post("/ocr")
async def extract_text(file: UploadFile = File(...)):
    """Extract the text contained in an uploaded image.

    The upload is read asynchronously, then decoding and model inference are
    handed to a worker thread so the event loop stays free to serve other
    requests (for example the "/" status endpoint) while generation runs.
    Requests that arrive concurrently may therefore run inference in parallel
    worker threads.

    Args:
        file: The uploaded image (multipart form field named "file").

    Returns:
        {"success": True, "text": <extracted text>} on success, or
        {"success": False, "error": <message>} if reading, decoding or
        inference fails. Failures are reported in the body rather than via an
        HTTP error status, as existing clients expect.
    """
    try:
        contents = await file.read()
        text = await asyncio.to_thread(_run_ocr, contents)
    except Exception as e:
        # Top-level request boundary: keep the established response shape,
        # but record the traceback so failures are diagnosable server-side.
        logging.getLogger(__name__).exception("OCR request failed")
        return {
            "success": False,
            "error": str(e),
        }

    return {
        "success": True,
        "text": text,
    }