Spaces:

manasdhir04
/

Image_generation

Sleeping

App Files Files Community

Image_generation / doc_anal.py

manasdhir

minor changes

5d1cbd9 9 months ago

raw

history blame contribute delete

6.44 kB

	import logging
	import requests
	import time
	from typing import Union, Dict
	from config import key, endpoint, version


	class DocumentIntelligenceService:
	    """
	    A service class for interacting with Azure Document Intelligence API.

	    The class submits a document to an ``:analyze`` endpoint, then polls the
	    matching ``analyzeResults`` endpoint until the service reports a terminal
	    status. Credentials, endpoint and API version are read from ``config``.
	    """

	    # Timeout (seconds) passed to every ``requests`` call so that a stalled
	    # connection cannot block the caller forever.
	    REQUEST_TIMEOUT = 60

	    # Values of the "status" field after which polling stops.
	    TERMINAL_STATUSES = ("succeeded", "failed")

	    def __init__(self):
	        """
	        Initialize the DocumentIntelligenceService with API credentials and endpoint.
	        """
	        self.key = key
	        self.endpoint = endpoint
	        self.api_version = version  # Currently only available in East US, West US2, and West Europe

	    def analyze(
	        self,
	        source: Union[str, bytes],
	        is_url: bool = True,
	        model_id: str = "prebuilt-layout",
	        poll_interval: float = 2.0,
	        max_wait: Union[float, None] = None,
	    ) -> Dict:
	        """
	        Analyze a document using Azure Document Intelligence.

	        Args:
	            source (Union[str, bytes]): The document source, either a URL or base64 encoded content.
	            is_url (bool): True if the source is a URL, False if it's base64 encoded content.
	            model_id (str): The ID of the model to use for analysis.
	            poll_interval (float): Seconds to sleep before each status poll.
	            max_wait (Union[float, None]): Upper bound in seconds on the polling
	                phase; None (the default) polls until a terminal status.

	        Returns:
	            Dict: The JSON body of the final poll. It is returned as-is for both
	            the "succeeded" and the "failed" status, so callers should check
	            the "status" key.

	        Raises:
	            ValueError: If the Operation-Location header is missing, or a bytes
	                source is not ASCII (i.e. not base64 text).
	            TimeoutError: If ``max_wait`` is set and exceeded.
	            requests.HTTPError: If the API request fails.
	        """
	        result_id = self._submit_analysis(source, is_url, model_id)
	        return self._get_analysis_results(
	            result_id, model_id, poll_interval=poll_interval, max_wait=max_wait
	        )

	    def _build_url(self, path: str) -> str:
	        """Return the documentModels URL for *path*, tolerating a trailing '/' on the endpoint."""
	        base = str(self.endpoint).rstrip("/")
	        return (
	            f"{base}/documentintelligence/documentModels/{path}"
	            f"?api-version={self.api_version}&outputContentFormat=markdown"
	        )

	    @staticmethod
	    def _as_text(source: Union[str, bytes]) -> str:
	        """Return *source* as ``str`` so it can be placed in a JSON payload.

	        Bytes are expected to already be base64 text and are decoded as ASCII;
	        they are NOT base64-encoded here.
	        """
	        if isinstance(source, (bytes, bytearray)):
	            try:
	                return bytes(source).decode("ascii")
	            except UnicodeDecodeError as err:
	                raise ValueError(
	                    "bytes source must be base64-encoded (ASCII) content."
	                ) from err
	        return source

	    @staticmethod
	    def _result_id_from_operation_location(operation_location: str) -> str:
	        """Return the last path segment of *operation_location*, without any query string."""
	        return operation_location.split("/")[-1].split("?")[0]

	    def _submit_analysis(
	        self, source: Union[str, bytes], is_url: bool, model_id: str
	    ) -> str:
	        """
	        Submit a document for analysis to Azure Document Intelligence.

	        Args:
	            source (Union[str, bytes]): The document source, either a URL or base64 encoded content.
	            is_url (bool): True if the source is a URL, False if it's base64 encoded content.
	            model_id (str): The ID of the model to use for analysis.

	        Returns:
	            str: The result ID for the submitted analysis.

	        Raises:
	            ValueError: If the Operation-Location header is missing in the
	                response, or a bytes source is not ASCII.
	            requests.HTTPError: If the API request fails.
	        """
	        url = self._build_url(f"{model_id}:analyze")
	        headers = {
	            "Content-Type": "application/json",
	            "Ocp-Apim-Subscription-Key": self.key,
	        }
	        # json= cannot serialize bytes, so normalize the source to str first.
	        payload_field = "urlSource" if is_url else "base64Source"
	        data = {payload_field: self._as_text(source)}

	        logging.info("Submitting document for analysis")
	        response = requests.post(
	            url, headers=headers, json=data, timeout=self.REQUEST_TIMEOUT
	        )
	        response.raise_for_status()

	        operation_location = response.headers.get("Operation-Location")
	        if not operation_location:
	            raise ValueError("Operation-Location header is missing in the response.")

	        return self._result_id_from_operation_location(operation_location)

	    def _get_analysis_results(
	        self,
	        result_id: str,
	        model_id: str,
	        poll_interval: float = 2.0,
	        max_wait: Union[float, None] = None,
	    ) -> Dict:
	        """
	        Retrieve the analysis results from Azure Document Intelligence.

	        Sleeps ``poll_interval`` seconds, fetches the result document, and
	        repeats until its "status" is one of ``TERMINAL_STATUSES``.

	        Args:
	            result_id (str): The ID of the analysis result to retrieve.
	            model_id (str): The ID of the model used for analysis.
	            poll_interval (float): Seconds to sleep before each poll.
	            max_wait (Union[float, None]): Maximum seconds to keep polling;
	                None means no limit.

	        Returns:
	            Dict: The analysis results.

	        Raises:
	            TimeoutError: If ``max_wait`` elapses before a terminal status.
	            requests.HTTPError: If the API request fails.
	        """
	        url = self._build_url(f"{model_id}/analyzeResults/{result_id}")
	        headers = {"Ocp-Apim-Subscription-Key": self.key}
	        # Monotonic clock: immune to wall-clock adjustments during the wait.
	        deadline = None if max_wait is None else time.monotonic() + max_wait

	        while True:
	            logging.info("Waiting for analysis to complete.")
	            time.sleep(poll_interval)
	            response = requests.get(
	                url, headers=headers, timeout=self.REQUEST_TIMEOUT
	            )
	            response.raise_for_status()
	            data = response.json()

	            if data.get("status") in self.TERMINAL_STATUSES:
	                return data

	            # Checked after the poll so at least one request is always made.
	            if deadline is not None and time.monotonic() >= deadline:
	                raise TimeoutError(
	                    f"Analysis {result_id} did not finish within {max_wait} seconds."
	                )


	# if __name__ == "__main__":
	# # Example usage of the DocumentIntelligenceService
	# client = DocumentIntelligenceService()
	# analysis_results = client.analyze(
	# source="https://hackrx.blob.core.windows.net/assets/policy.pdf?sv=2023-01-03&st=2025-07-04T09%3A11%3A24Z&se=2027-07-05T09%3A11%3A00Z&sr=b&sp=r&sig=N4a9OU0w0QXO6AOIBiu4bpl7AXvEZogeT%2FjUHNO7HzQ%3D"
	# )
	# print(analysis_results.keys())
	# print(analysis_results["analyzeResult"].keys())
	# print(analysis_results["analyzeResult"]["content"])
	# print(analysis_results["analyzeResult"]["tables"])

	import json

	def main() -> None:
	    """Run one example analysis and dump the results to files in the working directory.

	    Writes analysis_output.json, analysis_keys.txt, analyze_result_keys.txt,
	    content2.txt and tables.json, and prints the elapsed analysis time.
	    """
	    # Example usage of the DocumentIntelligenceService
	    client = DocumentIntelligenceService()
	    t1 = time.time()
	    analysis_results = client.analyze(
	        source="https://hackrx.blob.core.windows.net/assets/principia_newton.pdf?sv=2023-01-03&st=2025-07-28T07%3A20%3A32Z&se=2026-07-29T07%3A20%3A00Z&sr=b&sp=r&sig=V5I1QYyigoxeUMbnUKsdEaST99F5%2FDfo7wpKg9XXF5w%3D"
	    )
	    t2 = time.time()
	    print("analysis_time", t2 - t1)

	    # Write the full JSON result to a file
	    with open("analysis_output.json", "w", encoding="utf-8") as f_json:
	        json.dump(analysis_results, f_json, indent=4, ensure_ascii=False)

	    # Write just the list of top-level keys to a text file.
	    # The loop variable is deliberately not called `key`: that name is the
	    # API credential imported from config.
	    with open("analysis_keys.txt", "w", encoding="utf-8") as f_keys:
	        f_keys.write("Top-level keys in analysis_results:\n")
	        f_keys.writelines(f"{top_key}\n" for top_key in analysis_results)

	    # Write keys inside analyzeResult (empty when the key is absent)
	    analyze_result = analysis_results.get("analyzeResult", {})
	    with open("analyze_result_keys.txt", "w", encoding="utf-8") as f_ar_keys:
	        f_ar_keys.write("Keys inside 'analyzeResult':\n")
	        f_ar_keys.writelines(f"{inner_key}\n" for inner_key in analyze_result)

	    # Write the content text to a file
	    content = analyze_result.get("content", "")
	    with open("content2.txt", "w", encoding="utf-8") as f_content:
	        f_content.write(content)

	    # Write a JSON dump of the tables extracted (if any)
	    tables = analyze_result.get("tables", [])
	    with open("tables.json", "w", encoding="utf-8") as f_tables:
	        json.dump(tables, f_tables, indent=4, ensure_ascii=False)

	    print("Analysis results have been saved to files:")
	    print("- analysis_output.json")
	    print("- analysis_keys.txt")
	    print("- analyze_result_keys.txt")
	    # Matches the file actually written above (was reported as content.txt).
	    print("- content2.txt")
	    print("- tables.json")


	if __name__ == "__main__":
	    main()