New_Final_Assignment

Sleeping

App Files Files Community

naman1102 committed on Jun 2, 2025

Commit

133d76b

1 Parent(s): cfc12ce

ocr

Browse files

Files changed (2) hide show

apt.txt +0 -2
tools.py +84 -22

apt.txt DELETED Viewed

	@@ -1,2 +0,0 @@
1	- tesseract-ocr
2	- libtesseract-dev

tools.py CHANGED Viewed

@@ -88,47 +88,109 @@ def web_search_tool(state: AgentState) -> AgentState:
     }
 def ocr_image_tool(state: AgentState) -> AgentState:
     """
-    Expects state["ocr_path"] to be either:
-      • A real local image path (e.g. "./hf_files/abc.png"), or
-      • A Task ID string like "abc123", in which case we GET /files/abc123.
     Returns:
-      { "ocr_path": None, "ocr_result": "<OCRed text or error string>" }
-    Always attempts to download the file for the given path or task ID.
     """
     print("reached ocr_image_tool")
-    # path_or_id = state.get("ocr_path", "")
-    # if not path_or_id:
-    #     return {}
-    # Always attempt to download the file, regardless of local existence
     local_img = ""
-    for ext in ("png", "jpg", "jpeg"):
-        candidate = _download_file_for_task(state.get("task_id"), ext)
-        if candidate:
-            local_img = candidate
-            break
     if not local_img or not os.path.exists(local_img):
         return {
             "ocr_path": None,
-            "ocr_result": "Error: No image file found (download failed)."
         }
-    # Run OCR
     try:
-        img = Image.open(local_img)
-        text = pytesseract.image_to_string(img).strip() or "(no visible text)"
     except Exception as e:
-        text = f"Error during OCR: {e}"
-    print(f"OCRed as ocr_result: {text}")
     return {
         "ocr_path": None,
-        "ocr_result": text
     }
 def parse_excel_tool(state: AgentState) -> AgentState:
     """
     Expects state["excel_path"] to be either:

     }
 def ocr_image_tool(state: AgentState) -> AgentState:
     """
+    Extract text and a short caption from an image via the Hugging Face
+    Inference API.
+
+    Expects: state["ocr_path"] is either:
+      • a local image path (e.g. "./hf_files/abc.png"), OR
+      • a Task ID (e.g. "abc123"), in which case we try downloading it with
+        _download_file_for_task, using the png/jpg/jpeg extensions in turn.
+    The API token is read from the HF_TOKEN environment variable.
+
     Returns:
+      {} when state["ocr_path"] is missing or empty; otherwise
+      {
+        "ocr_path": None,
+        "ocr_result": "<OCR text + brief caption or an error message>"
+      }
+    Failures of the OCR or captioning request are reported inside
+    "ocr_result" rather than raised.
     """
     print("reached ocr_image_tool")
+    path_or_id = state.get("ocr_path", "")
+    if not path_or_id:
+        return {}
+    # 1) Determine local_img: either existing path_or_id or download by Task ID
     local_img = ""
+    if os.path.exists(path_or_id):
+        local_img = path_or_id
+    else:
+        for ext in ("png", "jpg", "jpeg"):
+            candidate = _download_file_for_task(path_or_id, ext)
+            if candidate:
+                local_img = candidate
+                break
     if not local_img or not os.path.exists(local_img):
         return {
             "ocr_path": None,
+            "ocr_result": "Error: No image file found (local nonexistent or download failed)."
+        }
+    # 2) Read raw bytes
+    try:
+        with open(local_img, "rb") as f:
+            image_bytes = f.read()
+    except Exception as e:
+        return {
+            "ocr_path": None,
+            "ocr_result": f"Error reading image file: {e}"
+        }
+    # 3) Prepare HF Inference headers
+    hf_token = os.getenv("HF_TOKEN")
+    if not hf_token:
+        # Name the variable that is actually read above so the message is actionable.
+        return {
+            "ocr_path": None,
+            "ocr_result": "Error: HF_TOKEN not set in environment."
         }
+    headers = {"Authorization": f"Bearer {hf_token}"}
+    # 4) Call HF's vision-ocr to extract text
+    # NOTE(review): the model ID and the "pages"/"lines" response schema used
+    # below are assumptions of this code — confirm against the model card.
+    try:
+        ocr_resp = requests.post(
+            "https://api-inference.huggingface.co/models/google/vit-ocr",
+            headers=headers,
+            # The Inference API takes the raw image bytes as the request body,
+            # not a multipart form upload.
+            data=image_bytes,
+            timeout=30
+        )
+        ocr_resp.raise_for_status()
+        ocr_json = ocr_resp.json()
+        # Expected JSON: "pages" → each page has "lines" → each line has "text"
+        lines = []
+        for page in ocr_json.get("pages", []):
+            for line in page.get("lines", []):
+                lines.append(line.get("text", "").strip())
+        ocr_text = "\n".join(lines).strip() or "(no visible text)"
+    except Exception as e:
+        # Best effort: keep going so the caption can still be returned.
+        ocr_text = f"Error during HF OCR: {e}"
+    # 5) Call HF's image-captioning to get a brief description
     try:
+        cap_resp = requests.post(
+            "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base",
+            headers=headers,
+            data=image_bytes,
+            timeout=30
+        )
+        cap_resp.raise_for_status()
+        cap_json = cap_resp.json()
+        # Image-to-text responses are a list like [{"generated_text": "..."}];
+        # a bare {"generated_text": "..."} object is accepted as well.
+        if isinstance(cap_json, list):
+            cap_json = cap_json[0] if cap_json else {}
+        caption = (cap_json.get("generated_text") or "").strip()
+        if not caption:
+            caption = "(no caption returned)"
     except Exception as e:
+        caption = f"Error during HF captioning: {e}"
+    # 6) Combine OCR + caption
+    combined = f"OCR text:\n{ocr_text}\n\nImage caption:\n{caption}"
     return {
         "ocr_path": None,
+        "ocr_result": combined
     }
 def parse_excel_tool(state: AgentState) -> AgentState:
     """
     Expects state["excel_path"] to be either: