Spaces:

MakPr016
/

parse-api

Running

App Files Files Community

MakPr016 committed 12 days ago

Commit

1d09b8f

0 Parent(s):

Setup

Browse files

Files changed (6) hide show

.gitignore +2 -0
Dockerfile +19 -0
README.md +16 -0
main.py +191 -0
matcher.py +66 -0
requirements.txt +5 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ __pycache__
2	+ master_index.json

Dockerfile ADDED Viewed

	@@ -0,0 +1,19 @@

+FROM python:3.10-slim
+# Run as an unprivileged user with UID 1000
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+WORKDIR $HOME/app
+# Install dependencies first so this layer is cached when only app code changes
+COPY --chown=user requirements.txt .
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+# Copy the rest of the app
+COPY --chown=user . .
+# Ensure the data directory exists and has permissions
+RUN mkdir -p data/uploaded
+# Exec form takes one array element per argument; a single string would be
+# treated as the name of the executable and the container would fail to start.
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

README.md ADDED Viewed

	@@ -0,0 +1,16 @@

+---
+title: Medicine Parser API
+emoji: 💊
+colorFrom: blue
+colorTo: green
+sdk: docker
+app_port: 7860
+---
+## Medicine Parser & Vendor Matcher
+This is a FastAPI application deployed via Docker.
+- **Upload:** POST `/api/upload`
+- **Parse:** POST `/api/parse/{id}`
+- **Match:** POST `/api/match-all`

main.py ADDED Viewed

	@@ -0,0 +1,191 @@

+import os
+import json
+import uuid
+import math
+import re
+import shutil
+import asyncio
+import pdfplumber
+from typing import List, Dict, Optional, Any
+from fastapi import FastAPI, HTTPException, UploadFile, File, BackgroundTasks
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+# All paths are resolved relative to the directory that contains this file.
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+# Directory where uploaded PDFs are written by the upload endpoint.
+DATA_DIR = os.path.join(BASE_DIR, 'data', 'uploaded')
+# JSON file read by load_master_index().
+MASTER_INDEX_PATH = os.path.join(BASE_DIR, 'data', 'master_index.json')
+# Create the upload directory at import time; exist_ok makes repeated imports harmless.
+os.makedirs(DATA_DIR, exist_ok=True)
+# Process-wide cache for the parsed master index; stays empty until a load succeeds.
+_MASTER_INDEX_CACHE = {}
+def load_master_index():
+    global _MASTER_INDEX_CACHE
+    if _MASTER_INDEX_CACHE: return _MASTER_INDEX_CACHE
+    if os.path.exists(MASTER_INDEX_PATH):
+        with open(MASTER_INDEX_PATH, 'r', encoding='utf-8') as f:
+            _MASTER_INDEX_CACHE = json.load(f)
+    return _MASTER_INDEX_CACHE
+def clean_text(text: Optional[str]) -> str:
+    return text.replace('\n', ' ').strip() if text else ""
+def is_garbage_row(row_text: str) -> bool:
+    blacklist = [
+        "click or tap",
+        "enter text",
+        "rfq reference",
+        "signature",
+        "date:",
+        "authorized by",
+        "page ",
+        "payment terms"
+    ]
+    t = row_text.lower()
+    return any(bad in t for bad in blacklist)
+async def delete_file_safety_net(file_path: str, delay: int = 600):
+    await asyncio.sleep(delay)
+    try:
+        if os.path.exists(file_path):
+            os.remove(file_path)
+    except Exception:
+        pass
+def parse_pdf_file(file_path: str) -> List[Dict[str, Any]]:
+    extracted_items = []
+    with pdfplumber.open(file_path) as pdf:
+        for page in pdf.pages:
+            tables = page.extract_tables()
+            for table in tables:
+                for row in table:
+                    cleaned_row = [clean_text(cell) for cell in row if cell is not None and clean_text(cell) != ""]
+                    if not cleaned_row: continue
+                    row_text = " ".join(cleaned_row)
+                    if is_garbage_row(row_text): continue
+                    if "description" in row_text.lower() and "qty" in row_text.lower(): continue
+                    try:
+                        qty = 1
+                        qty_idx = -1
+                        for i in range(len(cleaned_row) - 1, -1, -1):
+                            val = cleaned_row[i].replace(',', '').replace('.', '')
+                            if val.isdigit() and int(val) < 1000000:
+                                qty = int(val)
+                                qty_idx = i
+                                break
+                        if qty_idx == -1: continue
+                        desc_idx = 0
+                        if re.match(r'^\d+\.?$', cleaned_row[0]) and len(cleaned_row) > 1:
+                            desc_idx = 1
+                        description = cleaned_row[desc_idx]
+                        if re.match(r'^\d+$', description): continue
+                        if is_garbage_row(description): continue
+                        unit = "Unit"
+                        if qty_idx > 0 and qty_idx > desc_idx:
+                            potential_unit = cleaned_row[qty_idx - 1]
+                            if len(potential_unit) < 20 and potential_unit != description:
+                                unit = potential_unit
+                        extracted_items.append({
+                            "inn_name": description,
+                            "quantity": qty,
+                            "form": unit,
+                            "dosage": ""
+                        })
+                    except Exception:
+                        continue
+    return extracted_items
+# Application object; the route handlers below register themselves on it.
+app = FastAPI()
+# CORS is fully open: every origin, method and header is accepted.
+# NOTE(review): restrict allow_origins if the API is not meant to be callable from any site - confirm intent.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+@app.on_event("startup")
+def startup():
+    """Warm the master-index cache when the application starts."""
+    load_master_index()
+@app.post("/api/upload")
+async def upload_document(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
+    doc_id = str(uuid.uuid4())
+    filename = f"{doc_id}.pdf"
+    file_path = os.path.join(DATA_DIR, filename)
+    with open(file_path, "wb") as buffer:
+        shutil.copyfileobj(file.file, buffer)
+    background_tasks.add_task(delete_file_safety_net, file_path, 600)
+    return {"document_id": doc_id, "message": "Upload successful"}
+@app.post("/api/parse/{document_id}")
+async def parse_document(document_id: str):
+    file_path = os.path.join(DATA_DIR, f"{document_id}.pdf")
+    if not os.path.exists(file_path):
+        raise HTTPException(status_code=404, detail="File not found")
+    try:
+        items = parse_pdf_file(file_path)
+        if os.path.exists(file_path):
+            os.remove(file_path)
+        return {
+            "document_id": document_id,
+            "data": { "line_items": items }
+        }
+    except Exception as e:
+        if os.path.exists(file_path):
+            os.remove(file_path)
+        raise HTTPException(status_code=500, detail="Parsing failed")
+class MatchRequest(BaseModel):
+    """Request body for POST /api/match-all."""
+    # Line items to match; match_all reads the 'inn_name' and 'quantity' keys of each dict.
+    items: List[Dict[str, Any]]
+    # Preference names sent by the client; match_all does not currently read this field.
+    preferences: List[str] = []
+@app.post("/api/match-all")
+async def match_all(req: MatchRequest):
+    index = load_master_index()
+    vendors = index.get('vendors', [])
+    results = []
+    for item in req.items:
+        name = item.get('inn_name') or 'Unknown'
+        qty = int(item.get('quantity', 1))
+        matches = []
+        for v in vendors:
+            cats = [c.lower() for c in v.get('primary_categories', [])]
+            if 'pharmaceuticals' in cats or 'medical devices' in cats:
+                 matches.append({
+                    'vendor_id': v.get('vendor_id'),
+                    'name': v.get('legal_name'),
+                    'country': (v.get('countries_served') or ['Unknown'])[0],
+                    'landedCost': v.get('landedCost', 10),
+                    'deliveryDays': v.get('deliveryDays', 5),
+                    'availableQty': v.get('availableQty', 1000),
+                    'qualityScore': v.get('confidence_score', 80) / 10.0,
+                    'reliabilityScore': 5,
+                    'score': 9.5
+                })
+        results.append({
+            "medicine": name,
+            "quantity": qty,
+            "top_vendor": matches[0] if matches else None,
+            "other_vendors": matches[1:5] if len(matches) > 1 else []
+        })
+    return {"matches": results}
+# Direct-run entry point: serve the app on loopback port 5001 when this file is executed as a script.
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="127.0.0.1", port=5001)

matcher.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import json
+import os
+from typing import List, Dict, Any
+# All paths are resolved relative to the directory that contains this file.
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+# NOTE(review): DATA_DIR is not referenced by the rest of this module as shown - confirm it is still needed.
+DATA_DIR = os.path.join(BASE_DIR, 'data', 'extracted')
+# JSON file read by init_master_index().
+MASTER_INDEX_PATH = os.path.join(BASE_DIR, 'data', 'master_index.json')
+# Module-level cache filled by init_master_index(); an empty list until then.
+_MASTER_INDEX_CACHE = []
+def init_master_index():
+    """Loads the index into the global cache variable."""
+    global _MASTER_INDEX_CACHE
+    if os.path.exists(MASTER_INDEX_PATH):
+        with open(MASTER_INDEX_PATH, 'r') as f:
+            _MASTER_INDEX_CACHE = json.load(f)
+        print(f"Loaded {len(_MASTER_INDEX_CACHE)} vendors into memory.")
+    else:
+        print("Warning: master_index.json not found.")
+        _MASTER_INDEX_CACHE = []
+def get_master_index():
+    """Return the cached index as last set by init_master_index() (an empty list before any load)."""
+    return _MASTER_INDEX_CACHE
+PRESET_WEIGHTS = {
+    'resource-saving': {'quantity': 0.1, 'cost': 0.5, 'delivery': 0.1, 'quality': 0.1, 'reliability': 0.2},
+    'time': {'quantity': 0.1, 'cost': 0.1, 'delivery': 0.5, 'quality': 0.1, 'reliability': 0.2},
+    'quality': {'quantity': 0.1, 'cost': 0.1, 'delivery': 0.1, 'quality': 0.5, 'reliability': 0.2},
+    'quantity': {'quantity': 0.5, 'cost': 0.1, 'delivery': 0.1, 'quality': 0.1, 'reliability': 0.2},
+    'default': {'quantity': 0.2, 'cost': 0.2, 'delivery': 0.2, 'quality': 0.2, 'reliability': 0.2}
+}
+def score_vendors(vendors: List[Dict], target_qty: int, preferences: List[str]):
+    w = PRESET_WEIGHTS['default'].copy()
+    # Improved preference merging (average all selected)
+    if preferences:
+        active_weights = [PRESET_WEIGHTS.get(p, PRESET_WEIGHTS['default']) for p in preferences]
+        if active_weights:
+            for key in w:
+                w[key] = sum(aw[key] for aw in active_weights) / len(active_weights)
+    scored = []
+    for v in vendors:
+        # Protect against ZeroDivisionError
+        s_qty = min(1.0, v.get('availableQty', 0) / target_qty) if target_qty > 0 else 0
+        s_cost = 1.0 / (1.0 + (v.get('landedCost', 0) / 100))
+        s_delivery = 1.0 / (1.0 + (v.get('deliveryDays', 0) / 7))
+        s_quality = v.get('qualityScore', 0) / 10.0
+        s_reliability = v.get('reliabilityScore', 0) / 10.0
+        final_score = (
+            w['quantity'] * s_qty +
+            w['cost'] * s_cost +
+            w['delivery'] * s_delivery +
+            w['quality'] * s_quality +
+            w['reliability'] * s_reliability
+        )
+        v_copy = v.copy()
+        v_copy['score'] = round(final_score * 10, 2)
+        scored.append(v_copy)
+    return sorted(scored, key=lambda x: x['score'], reverse=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+fastapi>=0.100.0
+uvicorn[standard]>=0.23.0
+pydantic>=2.0.0
+python-multipart>=0.0.6
+pdfplumber>=0.10.0