Spaces:
Sleeping
Sleeping
| import argparse | |
| import copy | |
| import os | |
| import re | |
| import subprocess | |
| import tempfile | |
| import base64 | |
| from pathlib import Path | |
| import fitz | |
| import gradio as gr | |
| import time | |
| import html | |
| from openai import OpenAI | |
| from s3_uploads import upload_to_s3 | |
| from environs import env | |
stop_generation = False


def stream_from_vllm(messages):
    """Stream completion deltas for *messages* from the hosted GLM-4.1V model.

    Opens a fresh OpenAI-compatible client against the HF router, requests a
    streaming chat completion, and yields each chunk's delta object as it
    arrives. Stops early when the module-level ``stop_generation`` flag is set.
    """
    global stop_generation
    api = OpenAI(
        base_url="https://router.huggingface.co/v1",
        api_key=env.str("HF_API_KEY"),
    )
    stream = api.chat.completions.create(
        model="THUDM/GLM-4.1V-9B-Thinking:novita",
        messages=messages,
        temperature=0.01,
        stream=True,
        max_tokens=8000,
    )
    for chunk in stream:
        if stop_generation:
            break
        choices = chunk.choices
        if choices and choices[0].delta:
            yield choices[0].delta
class GLM4VModel:
    """Helpers for turning uploaded files into chat-completion content parts
    and for streaming the model's markdown answer back to the UI."""

    # File suffix -> MIME type for data-URI encoding; unknown suffixes
    # fall back to image/jpeg (same behavior as the old if/elif ladder).
    _MIME_BY_SUFFIX = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.bmp': 'image/bmp',
        '.tiff': 'image/tiff',
        '.tif': 'image/tiff',
        '.webp': 'image/webp',
    }

    def _strip_html(self, text: str) -> str:
        """Remove all HTML/XML tags from *text* and trim surrounding whitespace."""
        return re.sub(r"<[^>]+>", "", text).strip()

    def _wrap_text(self, text: str):
        """Wrap plain text in the single-element content-part list the API expects."""
        return [{"type": "text", "text": text}]

    def _image_to_base64(self, image_path):
        """Return the image at *image_path* encoded as a base64 data URI."""
        with open(image_path, "rb") as image_file:
            encoded_string = base64.b64encode(image_file.read()).decode('utf-8')
        mime_type = self._MIME_BY_SUFFIX.get(Path(image_path).suffix.lower(), 'image/jpeg')
        return f"data:{mime_type};base64,{encoded_string}"

    def _pdf_to_imgs(self, pdf_path):
        """Rasterize every page of *pdf_path* to a PNG in the temp dir.

        Returns the list of image paths in page order (180 dpi).
        """
        doc = fitz.open(pdf_path)
        imgs = []
        for i in range(doc.page_count):
            pix = doc.load_page(i).get_pixmap(dpi=180)
            img_p = os.path.join(tempfile.gettempdir(), f"{Path(pdf_path).stem}_{i}.png")
            pix.save(img_p)
            imgs.append(img_p)
        doc.close()
        return imgs

    def _ppt_to_imgs(self, ppt_path):
        """Convert a PowerPoint file to page images via LibreOffice -> PDF -> PNG.

        Requires a `libreoffice` binary on PATH; raises CalledProcessError if
        the headless conversion fails.
        """
        tmp = tempfile.mkdtemp()
        subprocess.run(
            ["libreoffice", "--headless", "--convert-to", "pdf", "--outdir", tmp, ppt_path],
            check=True,
        )
        pdf_path = os.path.join(tmp, Path(ppt_path).stem + ".pdf")
        return self._pdf_to_imgs(pdf_path)

    def _files_to_content(self, media):
        """Map a list of local file paths to API content parts.

        Videos are uploaded as-is; images are uploaded directly; PPT/PPTX and
        PDF files are first rasterized to per-page images. Unrecognized
        extensions are silently skipped.
        """
        out = []
        for f in media or []:
            ext = Path(f).suffix.lower()
            if ext in [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg", ".m4v"]:
                out.append({"type": "video_url", "video_url": {"url": upload_to_s3(f)}})
            elif ext in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"]:
                out.append({"type": "image_url", "image_url": {"url": upload_to_s3(f)}})
            elif ext in [".ppt", ".pptx"]:
                for p in self._ppt_to_imgs(f):
                    out.append({"type": "image_url", "image_url": {"url": upload_to_s3(p)}})
            elif ext == ".pdf":
                for p in self._pdf_to_imgs(f):
                    out.append({"type": "image_url", "image_url": {"url": upload_to_s3(p)}})
        return out

    def _stream_fragment(self, reasoning_content: str = "", content: str = "", skip_think: bool = True):
        """Render the reasoning + answer buffers as one markdown fragment.

        The reasoning text is wrapped in a collapsible <details> section and
        is omitted entirely unless skip_think is False (the default hides it).
        """
        think_html = ""
        answer_md = ""
        if reasoning_content and not skip_think:
            reasoning_content_clean = reasoning_content.strip()
            think_html = (
                "### 💭 Thinking\n"
                "<details open>\n"
                "<summary>Click to expand</summary>\n\n"
                f"{reasoning_content_clean}\n"
                "</details>\n"
            )
        if content:
            answer_md = content.strip()
        return think_html + "\n\n" + answer_md

    def _build_messages(self, raw_hist, sys_prompt):
        """Convert UI chat history into API messages.

        Prepends a system message when *sys_prompt* is non-blank. Assistant
        turns have their <details> (thinking) section and any HTML tags
        stripped so only the clean answer text is sent back to the model.
        """
        msgs = []
        if sys_prompt.strip():
            msgs.append({"role": "system", "content": [{"type": "text", "text": sys_prompt.strip()}]})
        for h in raw_hist:
            if h["role"] == "user":
                msgs.append({"role": "user", "content": h["content"]})
            else:
                raw = re.sub(r"<details.*?</details>", "", h["content"], flags=re.DOTALL)
                clean_content = self._strip_html(raw).strip()
                if clean_content:
                    msgs.append({"role": "assistant", "content": self._wrap_text(clean_content)})
        return msgs

    def stream_generate(self, raw_hist, sys_prompt: str, *, skip_special_tokens: bool = False):
        """Stream cumulative markdown snapshots of the model's answer.

        *skip_special_tokens* is accepted for interface compatibility but
        unused. Yields the full rendered fragment (thinking hidden by
        default) after each delta; on any exception yields one final
        fragment describing the error.
        """
        global stop_generation
        stop_generation = False
        msgs = self._build_messages(raw_hist, sys_prompt)
        reasoning_buffer = ""
        content_buffer = ""
        try:
            for delta in stream_from_vllm(msgs):
                if stop_generation:
                    break
                # Deltas may arrive as attribute objects or plain dicts, and a
                # single delta can carry BOTH reasoning and answer text, so the
                # two fields are accumulated independently. (The previous
                # elif-chain dropped `content` when both were present and
                # contained an unreachable duplicate branch.)
                if isinstance(delta, dict):
                    reasoning_piece = delta.get('reasoning_content')
                    content_piece = delta.get('content')
                else:
                    reasoning_piece = getattr(delta, 'reasoning_content', None)
                    content_piece = getattr(delta, 'content', None)
                if reasoning_piece:
                    reasoning_buffer += reasoning_piece
                if content_piece:
                    content_buffer += content_piece
                yield self._stream_fragment(reasoning_buffer, content_buffer)
        except Exception as e:
            error_msg = f"Error during streaming: {str(e)}"
            yield self._stream_fragment("", error_msg)
# Shared model wrapper used by the Gradio handler below.
glm4v = GLM4VModel()
# System prompt steering the VLM toward extracting only the bill-of-material
# table. NOTE(review): the misspellings "METERIAL"/"colums" are kept verbatim —
# they may deliberately match the text printed on the drawings; confirm before
# correcting, since changing the prompt changes model behavior.
sys_prompt = """Instructions:
Extract only "BILL OF METERIAL" table containing columns same as it is!
colums: (POSITION, DESCRIPTION, N PIECES, MATERIAL (like SA 516 Gr.70N or SA 105 N), DIMENSIONS(like 1700 I.D. X 2045H 50 THK.), WT.Kgs
Ignore title blocks, revision notes, drawing numbers, and general annotations outside the "BILL OF METERIAL".
If a page contains multiple tables, extract only those explicitly related to BILL OF METERIAL.
Preserve the row and column's order and structure as it is!
Do not include any surrounding decorative lines or borders—only.
give clean tabular data.
output format: markdown table format with following columns (POSITION, DESCRIPTION, N PIECES, MATERIAL, DIMENSIONS(like 1700 I.D. X 2045H 50 THK.) and WT.Kgs)"""
def extract_table_from_file(file):
    """Gradio handler: stream the extracted table as markdown snapshots.

    Yields a progress banner first, then each cumulative chunk from the model
    so the output pane updates live. Errors are yielded as red HTML.
    """
    if file is None:
        # BUG FIX: `return "msg"` inside a generator rides on StopIteration
        # and is never delivered to the UI — the message must be yielded.
        yield "Please upload a file."
        return
    payload = glm4v._files_to_content([file.name])
    raw_hist = [{"role": "user", "content": payload}]
    full_response = ""
    yield "<h2>🌀 Processing...</h2>\n"
    try:
        for chunk in glm4v.stream_generate(raw_hist, sys_prompt):
            full_response = chunk
            yield full_response
    except Exception as e:
        yield f"<div style='color: red;'>Error: {html.escape(str(e))}</div>"
# --- Gradio UI -------------------------------------------------------------
theme = gr.themes.Ocean(
    primary_hue="gray",
)

with gr.Blocks(title="demo", theme=theme) as demo:
    gr.Markdown(
        # BUG FIX: the closing tag was truncated ("</div"), producing
        # malformed HTML in the rendered header.
        "<div style='text-align:center; margin-bottom:20px;'><h1> PDF Extraction Demo</h1></div>"
    )
    with gr.Row():
        with gr.Column():
            up = gr.File(label="Upload File", type="filepath")
            # NOTE(review): format_selector is rendered but never passed to
            # the click handler, so the output is always markdown regardless
            # of the CSV/JSON choice — confirm whether wiring it up is
            # intended (would require extending extract_table_from_file).
            format_selector = gr.Radio(choices=["CSV", "JSON"], label="Output Format", value="CSV")
            submit_btn = gr.Button("Submit", variant="primary")
        with gr.Column():
            output_markdown = gr.Markdown(label="Extracted Table")
    # Generator handler: Gradio streams each yielded value into the pane.
    submit_btn.click(
        extract_table_from_file,
        inputs=[up],
        outputs=[output_markdown],
    )

if __name__ == "__main__":
    demo.launch()