Spaces:

tntrauser
/

test_document

Sleeping

App Files Files Community

devanghingu committed on Jul 21, 2025

Commit

7fcd17e

verified ·

1 Parent(s): 2c88cb5

Upload 4 files

Browse files

Files changed (4) hide show

app.py +227 -0
packages.txt +1 -0
requirements.txt +9 -0
s3_uploads.py +27 -0

app.py ADDED Viewed

	@@ -0,0 +1,227 @@

+import argparse
+import copy
+import os
+import re
+import subprocess
+import tempfile
+import base64
+from pathlib import Path
+import fitz
+import gradio as gr
+import time
+import html
+from openai import OpenAI
+from s3_uploads import upload_to_s3
+from environs import env
+# Module-level cancellation flag. stream_from_vllm() and
+# GLM4VModel.stream_generate() poll it between chunks and stop early when it is
+# true; stream_generate() resets it to False at the start of every request.
+# NOTE(review): nothing visible in this file ever sets it to True.
+stop_generation = False
+def stream_from_vllm(messages):
+    global stop_generation
+    client = OpenAI(
+        base_url="https://router.huggingface.co/v1",
+        api_key=env.str("HF_API_KEY"),
+    )
+    response = client.chat.completions.create(
+        model="THUDM/GLM-4.1V-9B-Thinking:novita",
+        messages=messages,
+        temperature=0.01,
+        stream=True,
+        max_tokens=8000
+    )
+    for chunk in response:
+        if stop_generation:
+            break
+        if chunk.choices and chunk.choices[0].delta:
+            delta = chunk.choices[0].delta
+            yield delta
+class GLM4VModel:
+    def _strip_html(self, text: str) -> str:
+        return re.sub(r"<[^>]+>", "", text).strip()
+    def _wrap_text(self, text: str):
+        return [{"type": "text", "text": text}]
+    def _image_to_base64(self, image_path):
+        with open(image_path, "rb") as image_file:
+            encoded_string = base64.b64encode(image_file.read()).decode('utf-8')
+            ext = Path(image_path).suffix.lower()
+            if ext in ['.jpg', '.jpeg']:
+                mime_type = 'image/jpeg'
+            elif ext == '.png':
+                mime_type = 'image/png'
+            elif ext == '.gif':
+                mime_type = 'image/gif'
+            elif ext == '.bmp':
+                mime_type = 'image/bmp'
+            elif ext in ['.tiff', '.tif']:
+                mime_type = 'image/tiff'
+            elif ext == '.webp':
+                mime_type = 'image/webp'
+            else:
+                mime_type = 'image/jpeg'
+            return f"data:{mime_type};base64,{encoded_string}"
+    def _pdf_to_imgs(self, pdf_path):
+        doc = fitz.open(pdf_path)
+        imgs = []
+        for i in range(doc.page_count):
+            pix = doc.load_page(i).get_pixmap(dpi=180)
+            img_p = os.path.join(tempfile.gettempdir(), f"{Path(pdf_path).stem}_{i}.png")
+            pix.save(img_p)
+            imgs.append(img_p)
+        doc.close()
+        return imgs
+    def _ppt_to_imgs(self, ppt_path):
+        tmp = tempfile.mkdtemp()
+        subprocess.run(
+            ["libreoffice", "--headless", "--convert-to", "pdf", "--outdir", tmp, ppt_path],
+            check=True,
+        )
+        pdf_path = os.path.join(tmp, Path(ppt_path).stem + ".pdf")
+        return self._pdf_to_imgs(pdf_path)
+    def _files_to_content(self, media):
+        out = []
+        for f in media or []:
+            ext = Path(f).suffix.lower()
+            if ext in [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg", ".m4v"]:
+                out.append({"type": "video_url", "video_url": {"url": upload_to_s3(f)}})
+            elif ext in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"]:
+                out.append({"type": "image_url", "image_url": {"url": upload_to_s3(f)}})
+            elif ext in [".ppt", ".pptx"]:
+                for p in self._ppt_to_imgs(f):
+                    out.append({"type": "image_url", "image_url": {"url": upload_to_s3(p)}})
+            elif ext == ".pdf":
+                for p in self._pdf_to_imgs(f):
+                    out.append({"type": "image_url", "image_url": {"url": upload_to_s3(p)}})
+        return out
+    def _stream_fragment(self, reasoning_content: str = "", content: str = "", skip_think: bool = True):
+        think_html = ""
+        answer_md = ""
+        if reasoning_content and not skip_think:
+            reasoning_content_clean = reasoning_content.strip()
+            think_html = (
+                "### 💭 Thinking\n"
+                "<details open>\n"
+                "<summary>Click to expand</summary>\n\n"
+                f"{reasoning_content_clean}\n"
+                "</details>\n"
+            )
+        if content:
+            answer_md = content.strip()
+        return think_html + "\n\n" + answer_md
+    def _build_messages(self, raw_hist, sys_prompt):
+        msgs = []
+        if sys_prompt.strip():
+            msgs.append({"role": "system", "content": [{"type": "text", "text": sys_prompt.strip()}]})
+        for h in raw_hist:
+            if h["role"] == "user":
+                msgs.append({"role": "user", "content": h["content"]})
+            else:
+                raw = re.sub(r"<details.*?</details>", "", h["content"], flags=re.DOTALL)
+                clean_content = self._strip_html(raw).strip()
+                if clean_content:
+                    msgs.append({"role": "assistant", "content": self._wrap_text(clean_content)})
+        return msgs
+    def stream_generate(self, raw_hist, sys_prompt: str, *, skip_special_tokens: bool = False):
+        global stop_generation
+        stop_generation = False
+        msgs = self._build_messages(raw_hist, sys_prompt)
+        reasoning_buffer = ""
+        content_buffer = ""
+        try:
+            for delta in stream_from_vllm(msgs):
+                if stop_generation:
+                    break
+                if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
+                    reasoning_buffer += delta.reasoning_content
+                elif hasattr(delta, 'content') and delta.content:
+                    content_buffer += delta.content
+                else:
+                    if isinstance(delta, dict):
+                        if 'reasoning_content' in delta and delta['reasoning_content']:
+                            reasoning_buffer += delta['reasoning_content']
+                        if 'content' in delta and delta['content']:
+                            content_buffer += delta['content']
+                    elif hasattr(delta, 'content') and delta.content:
+                        content_buffer += delta.content
+                yield self._stream_fragment(reasoning_buffer, content_buffer)
+        except Exception as e:
+            error_msg = f"Error during streaming: {str(e)}"
+            yield self._stream_fragment("", error_msg)
+# Single shared helper instance used by extract_table_from_file().
+glm4v = GLM4VModel()
+# System prompt sent with every extraction request: it asks the model to return
+# only the bill-of-material table as a Markdown table with fixed columns.
+# NOTE(review): "METERIAL", "colums" and the unbalanced "(" are part of the
+# runtime prompt; confirm whether "METERIAL" deliberately mirrors the wording
+# on the drawings before correcting any of it.
+sys_prompt = """Instructions:
+Extract only "BILL OF METERIAL" table containing columns same as it is!
+colums: (POSITION, DESCRIPTION, N PIECES, MATERIAL (like SA 516 Gr.70N or SA 105 N), DIMENSIONS(like 1700 I.D. X 2045H 50 THK.), WT.Kgs
+Ignore title blocks, revision notes, drawing numbers, and general annotations outside the "BILL OF METERIAL".
+If a page contains multiple tables, extract only those explicitly related to BILL OF METERIAL.
+Preserve the row and column structure as files.
+Do not include any surrounding decorative lines or borders—only clean tabular data.
+output format: markdown table format with following columns (POSITION, DESCRIPTION, N PIECES, MATERIAL, DIMENSIONS(like 1700 I.D. X 2045H 50 THK.) and WT.Kgs)"""
+def extract_table_from_file(file):
+    if file is None:
+        return "Please upload a file."
+    payload = glm4v._files_to_content([file.name])
+    raw_hist = [{"role": "user", "content": payload}]
+    full_response = ""
+    yield "<h2>🌀 Processing...</h2>\n"
+    try:
+        for chunk in glm4v.stream_generate(raw_hist, sys_prompt):
+            full_response = chunk
+            yield full_response
+    except Exception as e:
+        yield f"<div style='color: red;'>Error: {html.escape(str(e))}</div>"
+theme = gr.themes.Ocean(
+    primary_hue="gray",
+)
+with gr.Blocks(title="demo", theme=theme) as demo:
+    gr.Markdown(
+        "<div style='text-align:center; margin-bottom:20px;'><h1> PDF Extraction Demo</h1></div"
+    )
+    with gr.Row():
+        with gr.Column():
+            up = gr.File(label="Upload File", type="filepath")
+            format_selector = gr.Radio(choices=["CSV", "JSON"], label="Output Format", value="CSV")
+            submit_btn = gr.Button("Submit", variant="primary")
+        with gr.Column():
+            output_markdown = gr.Markdown(label="Extracted Table")
+    submit_btn.click(
+        extract_table_from_file,
+        inputs=[up],
+        outputs=[output_markdown],
+    )
+if __name__ == "__main__":
+    demo.launch()

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ libreoffice

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+gradio==5.25.0
+spaces>=0.37.1
+PyMuPDF>=1.26.1
+torchvision==0.20.1
+torch==2.5.1
+av>=14.4.0
+openai
+boto3
+environs

s3_uploads.py ADDED Viewed

	@@ -0,0 +1,27 @@

+import boto3
+import uuid
+from environs import env
+AWS_SECRET_KEY=env.str("AWS_SECRET_KEY")
+AWS_ACCESS_KEY=env.str("AWS_ACCESS_KEY")
+BUCKET_NAME = env.str("BUCKET_NAME")
+AWS_REGION = env.str("AWS_REGION")
+AWS_USER=env.str("AWS_USER", default="default_user")
+s3 = boto3.client(
+    's3',
+    aws_access_key_id=AWS_ACCESS_KEY,
+    aws_secret_access_key=AWS_SECRET_KEY,
+    region_name=AWS_REGION
+  )
+def upload_to_s3(file_path):
+  _file_path = file_path.split("/")[-1]
+  _file_path = _file_path.split(".")
+  _file_path[-2] = _file_path[-2]+"_" + str(uuid.uuid4())
+  s3_key = ".".join(_file_path)
+  s3.upload_file(file_path, BUCKET_NAME, s3_key)
+  file_path = f"https://{BUCKET_NAME}.s3.{AWS_REGION}.amazonaws.com/{s3_key}"
+  return file_path