from zipfile import ZipFile
from lxml import etree
from pathlib import Path

from pathlib import Path
import requests
import io
from urllib.parse import urlparse
import urllib.request

import fitz  

def extract_docx(docx_input) -> str:
    if isinstance(docx_input, (str, Path)):
        zipf = ZipFile(docx_input)
    elif isinstance(docx_input, io.BytesIO):
        zipf = ZipFile(docx_input)
    else:
        raise ValueError("Unsupported input type for extract_docx")

    xml_content = zipf.read("word/document.xml")
    tree = etree.fromstring(xml_content)

    ns = {
        "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
        "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
        "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape"
    }

    text_blocks = []

    # Extract paragraphs
    paragraphs = tree.xpath("//w:p", namespaces=ns)
    for p in paragraphs:
        texts = p.xpath(".//w:t", namespaces=ns)
        para_text = "".join(t.text for t in texts if t.text)
        if para_text.strip():
            text_blocks.append(para_text.strip())

    # Extract from text boxes
    tb_contents = tree.xpath("//w:txbxContent", namespaces=ns)
    for tb in tb_contents:
        texts = tb.xpath(".//w:t", namespaces=ns)
        tb_text = "".join(t.text for t in texts if t.text)
        if tb_text.strip():
            text_blocks.append(tb_text.strip())

    return "\n\n".join(text_blocks)

def extract_pdf(pdf_input) -> str:
    text = []

    if isinstance(pdf_input, (str, Path)):
        doc = fitz.open(pdf_input)
    elif isinstance(pdf_input, io.BytesIO):
        doc = fitz.open(stream=pdf_input, filetype="pdf")
    else:
        raise ValueError("Unsupported input type for extract_pdf")

    with doc:
        for page in doc:
            page_text = page.get_text("text")
            text.append(page_text)

    return "\n".join(text)


def detect_file_type_from_bytes(content: bytes) -> str:
    if content.startswith(b'%PDF'):
        return "pdf"
    elif content[0:2] == b'PK' and b'word/' in content:  # DOCX is a ZIP with word/ inside
        return "docx"
    elif all(chr(b).isprintable() or chr(b).isspace() for b in content[:100]):
        return "txt"
    return None

def convert_google_docs_url(url: str) -> str:
    if "docs.google.com" in url:
        # Extract document ID from various Google Docs URL formats
        if "/document/d/" in url:
            doc_id = url.split("/document/d/")[1].split("/")[0]
            return f"https://docs.google.com/document/d/{doc_id}/export?format=pdf"
        elif "id=" in url:
            doc_id = url.split("id=")[1].split("&")[0]
            return f"https://docs.google.com/document/d/{doc_id}/export?format=pdf"
        # Handle URLs like the one you provided with complex parameters
        elif "?usp=drive_link" in url or "rtpof=true" in url:
            # Extract doc ID from the full URL
            if "/d/" in url:
                doc_id = url.split("/d/")[1].split("/")[0]
                return f"https://docs.google.com/document/d/{doc_id}/export?format=pdf"
    return url

def extract(file_path_or_url: str):
    is_url = urlparse(file_path_or_url).scheme in ("http", "https")

    if is_url:
        url = convert_google_docs_url(url)
        try:
            response = requests.get(file_path_or_url)
            response.raise_for_status()
            content = response.content
            file_type = detect_file_type_from_bytes(content)
            file_like = io.BytesIO(content)
        except Exception as e:
            raise ValueError(f"Failed to fetch file: {e}")
    else:
        file_type = Path(file_path_or_url).suffix.lower().lstrip(".")
        file_like = file_path_or_url  # keep as path for local files

    if file_type == "pdf":
        text = extract_pdf(file_like if is_url else file_path_or_url)
        elements = partition_text(text=text)
    elif file_type == "docx":
        text = extract_docx(file_like if is_url else file_path_or_url)
        elements = partition_text(text=text)
    elif file_type == "txt":
        if is_url:
            text = content.decode("utf-8", errors="ignore")
        else:
            with open(file_path_or_url, 'r', encoding='utf-8') as f:
                text = f.read()
        elements = partition_text(text=text)
    else:
        raise ValueError("Unsupported or undetectable file type.")

    # chunking logic
    chunks = []
    section = "Unknown"
    for i, el in enumerate(elements):
        if el.category == "Title":
            section = el.text.strip()
        elif el.category in ["NarrativeText", "ListItem"]:
            chunks.append({
                "clause_id": f"auto_{i}",
                "section_title": section,
                "raw_text": el.text.strip(),
                "source_file": (
                    Path(file_path_or_url).name if not is_url else file_path_or_url.split("/")[-1]
                ),
                "position_in_doc": i
            })
    return chunks