MakPr016 committed on
Commit
1d09b8f
·
0 Parent(s):
Files changed (6) hide show
  1. .gitignore +2 -0
  2. Dockerfile +19 -0
  3. README.md +16 -0
  4. main.py +191 -0
  5. matcher.py +66 -0
  6. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ __pycache__
2
+ master_index.json
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ RUN useradd -m -u 1000 user
4
+ USER user
5
+ ENV HOME=/home/user \
6
+ PATH=/home/user/.local/bin:$PATH
7
+
8
+ WORKDIR $HOME/app
9
+
10
+ COPY --chown=user requirements.txt .
11
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
12
+
13
+ # Copy the rest of the app
14
+ COPY --chown=user . .
15
+
16
+ # Ensure the data directory exists and has permissions
17
+ RUN mkdir -p data/uploaded
18
+
19
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Medicine Parser API
3
+ emoji: 💊
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: docker
7
+ app_port: 7860
8
+ ---
9
+
10
+ ## Medicine Parser & Vendor Matcher
11
+
12
+ This is a FastAPI application deployed via Docker.
13
+
14
+ - **Upload:** POST `/api/upload`
15
+ - **Parse:** POST `/api/parse/{id}`
16
+ - **Match:** POST `/api/match-all`
main.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import uuid
4
+ import math
5
+ import re
6
+ import shutil
7
+ import asyncio
8
+ import pdfplumber
9
+ from typing import List, Dict, Optional, Any
10
+ from fastapi import FastAPI, HTTPException, UploadFile, File, BackgroundTasks
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+ from pydantic import BaseModel
13
+
14
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, 'data', 'uploaded')
MASTER_INDEX_PATH = os.path.join(BASE_DIR, 'data', 'master_index.json')

os.makedirs(DATA_DIR, exist_ok=True)

# Process-wide cache for the vendor master index; populated lazily.
_MASTER_INDEX_CACHE = {}

def load_master_index():
    """Return the vendor master index, reading it from disk on first use.

    The loaded JSON is memoized in ``_MASTER_INDEX_CACHE``; if the file is
    missing, an empty dict is returned (and re-checked on the next call,
    since an empty cache is falsy).
    """
    global _MASTER_INDEX_CACHE
    if not _MASTER_INDEX_CACHE:
        if os.path.exists(MASTER_INDEX_PATH):
            with open(MASTER_INDEX_PATH, 'r', encoding='utf-8') as fh:
                _MASTER_INDEX_CACHE = json.load(fh)
    return _MASTER_INDEX_CACHE
29
+
30
def clean_text(text: Optional[str]) -> str:
    """Collapse newlines to spaces and trim; return "" for None/empty input."""
    if not text:
        return ""
    return text.replace('\n', ' ').strip()
32
+
33
def is_garbage_row(row_text: str) -> bool:
    """True when the row looks like form boilerplate rather than a line item.

    Matches case-insensitively against a fixed phrase blacklist drawn from
    typical RFQ/form furniture (signatures, page footers, payment terms...).
    """
    boilerplate = (
        "click or tap",
        "enter text",
        "rfq reference",
        "signature",
        "date:",
        "authorized by",
        "page ",
        "payment terms",
    )
    lowered = row_text.lower()
    for phrase in boilerplate:
        if phrase in lowered:
            return True
    return False
46
+
47
async def delete_file_safety_net(file_path: str, delay: int = 600) -> None:
    """Best-effort deletion of ``file_path`` after ``delay`` seconds.

    Used as a background task so uploads that are never parsed do not
    accumulate on disk. The parse endpoint may delete the file first, so a
    missing file is the expected, silent case.

    Fixes vs. original: drops the ``os.path.exists``/``os.remove`` TOCTOU
    race (EAFP instead of LBYL) and narrows the over-broad
    ``except Exception`` to ``OSError`` so programming errors (e.g. a bad
    path type) are no longer silently swallowed.
    """
    await asyncio.sleep(delay)
    try:
        os.remove(file_path)
    except OSError:
        # Already gone or not removable — cleanup is best-effort by design;
        # never let this propagate into the event loop.
        pass
54
+
55
def parse_pdf_file(file_path: str) -> List[Dict[str, Any]]:
    """Heuristically extract medicine line items from tables in a PDF.

    For every table row on every page: drop empty cells and boilerplate
    rows, locate the quantity as the right-most purely numeric cell
    (< 1,000,000), take the description from the first cell (or the second
    when the first is just a row number like ``1`` or ``1.``), and treat
    the cell immediately left of the quantity as the unit when plausible.

    Returns a list of dicts with keys ``inn_name``, ``quantity``, ``form``
    and ``dosage`` (dosage is always "" — splitting it out is left to the
    caller/matcher).
    """
    items: List[Dict[str, Any]] = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                for raw_row in table:
                    # Keep only non-empty, whitespace-normalised cells.
                    cells = []
                    for cell in raw_row:
                        if cell is None:
                            continue
                        text = clean_text(cell)
                        if text:
                            cells.append(text)
                    if not cells:
                        continue

                    joined = " ".join(cells)
                    if is_garbage_row(joined):
                        continue
                    lowered = joined.lower()
                    # Skip the header row of the items table.
                    if "description" in lowered and "qty" in lowered:
                        continue

                    try:
                        # Scan right-to-left for the quantity cell; the
                        # 1,000,000 cap filters out prices/codes.
                        qty, qty_idx = 1, -1
                        for i in range(len(cells) - 1, -1, -1):
                            digits = cells[i].replace(',', '').replace('.', '')
                            if digits.isdigit() and int(digits) < 1000000:
                                qty, qty_idx = int(digits), i
                                break
                        if qty_idx == -1:
                            continue

                        # "1" / "1." in the first cell is a serial number,
                        # so the description lives in the second cell.
                        desc_idx = 0
                        if re.match(r'^\d+\.?$', cells[0]) and len(cells) > 1:
                            desc_idx = 1
                        description = cells[desc_idx]
                        if re.match(r'^\d+$', description):
                            continue
                        if is_garbage_row(description):
                            continue

                        # Cell just left of the quantity is usually the unit
                        # ("Box", "Vial", ...) when it is short and distinct.
                        unit = "Unit"
                        if qty_idx > 0 and qty_idx > desc_idx:
                            candidate = cells[qty_idx - 1]
                            if len(candidate) < 20 and candidate != description:
                                unit = candidate

                        items.append({
                            "inn_name": description,
                            "quantity": qty,
                            "form": unit,
                            "dosage": ""
                        })
                    except Exception:
                        # Malformed row — skip it rather than abort the parse.
                        continue
    return items
104
+
105
app = FastAPI()

# Wide-open CORS: any origin, method, and header.
# NOTE(review): "*" origins is maximally permissive — presumably the API is
# called from a separately-hosted frontend; tighten before adding auth/cookies.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
113
+
114
@app.on_event("startup")
def startup():
    # Warm the vendor-index cache at boot so the first /api/match-all call
    # does not pay the disk-read cost.
    # NOTE(review): @app.on_event is deprecated in newer FastAPI releases in
    # favour of lifespan handlers — confirm the pinned version before migrating.
    load_master_index()
117
+
118
@app.post("/api/upload")
async def upload_document(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
    """Persist an uploaded PDF under a fresh UUID and return that id.

    A background safety-net task deletes the file after 600 s in case the
    client never calls the parse endpoint (which also deletes it).
    """
    doc_id = str(uuid.uuid4())
    destination = os.path.join(DATA_DIR, f"{doc_id}.pdf")

    with open(destination, "wb") as out:
        shutil.copyfileobj(file.file, out)

    background_tasks.add_task(delete_file_safety_net, destination, 600)
    return {"document_id": doc_id, "message": "Upload successful"}
130
+
131
@app.post("/api/parse/{document_id}")
async def parse_document(document_id: str):
    """Parse a previously-uploaded PDF into line items.

    The upload is one-shot: the file is deleted whether parsing succeeds or
    fails, so a repeat call returns 404.

    Raises:
        HTTPException 404: no uploaded file for ``document_id``.
        HTTPException 500: extraction failed (original cause chained).

    Fixes vs. original: removes the duplicated cleanup branches via
    ``finally``, chains the underlying exception (``from e``) instead of
    discarding it, and deletes EAFP-style to avoid the exists/remove race.
    """
    file_path = os.path.join(DATA_DIR, f"{document_id}.pdf")
    if not os.path.exists(file_path):
        raise HTTPException(status_code=404, detail="File not found")

    try:
        items = parse_pdf_file(file_path)
    except Exception as e:
        raise HTTPException(status_code=500, detail="Parsing failed") from e
    finally:
        # Always consume the upload, on success and on failure alike.
        try:
            os.remove(file_path)
        except OSError:
            pass

    return {
        "document_id": document_id,
        "data": { "line_items": items }
    }
149
+
150
class MatchRequest(BaseModel):
    # Line items to match; presumably each dict follows the parse endpoint's
    # shape (inn_name / quantity / form / dosage) — only inn_name and
    # quantity are read downstream.
    items: List[Dict[str, Any]]
    # Optional scoring-preference keys; defaults to no preferences.
    preferences: List[str] = []
153
+
154
@app.post("/api/match-all")
async def match_all(req: MatchRequest):
    """Match every requested line item against vendors in the master index.

    A vendor qualifies when its primary categories include pharmaceuticals
    or medical devices; each item gets the first qualifying vendor as
    ``top_vendor`` plus up to four ``other_vendors``.
    """
    index = load_master_index()
    vendors = index.get('vendors', [])
    results = []

    for item in req.items:
        medicine = item.get('inn_name') or 'Unknown'
        quantity = int(item.get('quantity', 1))

        candidates = []
        for vendor in vendors:
            categories = [c.lower() for c in vendor.get('primary_categories', [])]
            if 'pharmaceuticals' not in categories and 'medical devices' not in categories:
                continue
            countries = vendor.get('countries_served') or ['Unknown']
            candidates.append({
                'vendor_id': vendor.get('vendor_id'),
                'name': vendor.get('legal_name'),
                'country': countries[0],
                'landedCost': vendor.get('landedCost', 10),
                'deliveryDays': vendor.get('deliveryDays', 5),
                'availableQty': vendor.get('availableQty', 1000),
                'qualityScore': vendor.get('confidence_score', 80) / 10.0,
                'reliabilityScore': 5,
                'score': 9.5
            })

        results.append({
            "medicine": medicine,
            "quantity": quantity,
            "top_vendor": candidates[0] if candidates else None,
            # Slicing past the end yields [] anyway, so no length guard needed.
            "other_vendors": candidates[1:5]
        })

    return {"matches": results}
188
+
189
if __name__ == "__main__":
    # Local development entry point only; the Docker image starts uvicorn
    # itself (port 7860) via its CMD.
    import uvicorn
    uvicorn.run(app, host="127.0.0.1", port=5001)
matcher.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from typing import List, Dict, Any
4
+
5
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, 'data', 'extracted')
MASTER_INDEX_PATH = os.path.join(BASE_DIR, 'data', 'master_index.json')

# Module-level cache of the vendor master index; empty until init runs.
_MASTER_INDEX_CACHE = []

def init_master_index():
    """Load master_index.json into the module-level cache (or warn if absent)."""
    global _MASTER_INDEX_CACHE
    if not os.path.exists(MASTER_INDEX_PATH):
        print("Warning: master_index.json not found.")
        _MASTER_INDEX_CACHE = []
        return
    with open(MASTER_INDEX_PATH, 'r') as f:
        _MASTER_INDEX_CACHE = json.load(f)
    print(f"Loaded {len(_MASTER_INDEX_CACHE)} vendors into memory.")

def get_master_index():
    """Return whatever init_master_index cached (empty list beforehand)."""
    return _MASTER_INDEX_CACHE
26
+
27
# Weight presets: each maps the five scoring criteria to weights summing to 1.
PRESET_WEIGHTS = {
    'resource-saving': {'quantity': 0.1, 'cost': 0.5, 'delivery': 0.1, 'quality': 0.1, 'reliability': 0.2},
    'time': {'quantity': 0.1, 'cost': 0.1, 'delivery': 0.5, 'quality': 0.1, 'reliability': 0.2},
    'quality': {'quantity': 0.1, 'cost': 0.1, 'delivery': 0.1, 'quality': 0.5, 'reliability': 0.2},
    'quantity': {'quantity': 0.5, 'cost': 0.1, 'delivery': 0.1, 'quality': 0.1, 'reliability': 0.2},
    'default': {'quantity': 0.2, 'cost': 0.2, 'delivery': 0.2, 'quality': 0.2, 'reliability': 0.2}
}

def score_vendors(vendors: List[Dict], target_qty: int, preferences: List[str]):
    """Score vendors on a 0-10 scale and return them best-first.

    Weights start from the ``default`` preset; when ``preferences`` names
    one or more presets, their weights are averaged component-wise
    (unknown preference keys fall back to the default preset).
    Each component score is normalised into [0, 1]; a zero/negative
    ``target_qty`` yields a quantity score of 0 rather than dividing by zero.
    """
    weights = PRESET_WEIGHTS['default'].copy()
    if preferences:
        selected = [PRESET_WEIGHTS.get(p, PRESET_WEIGHTS['default']) for p in preferences]
        if selected:
            count = len(selected)
            weights = {k: sum(s[k] for s in selected) / count for k in weights}

    ranked = []
    for vendor in vendors:
        # Guarded division: target_qty == 0 means "no quantity signal".
        qty_score = min(1.0, vendor.get('availableQty', 0) / target_qty) if target_qty > 0 else 0
        cost_score = 1.0 / (1.0 + (vendor.get('landedCost', 0) / 100))
        delivery_score = 1.0 / (1.0 + (vendor.get('deliveryDays', 0) / 7))
        quality_score = vendor.get('qualityScore', 0) / 10.0
        reliability_score = vendor.get('reliabilityScore', 0) / 10.0

        total = (
            weights['quantity'] * qty_score +
            weights['cost'] * cost_score +
            weights['delivery'] * delivery_score +
            weights['quality'] * quality_score +
            weights['reliability'] * reliability_score
        )

        entry = vendor.copy()
        entry['score'] = round(total * 10, 2)
        ranked.append(entry)

    ranked.sort(key=lambda e: e['score'], reverse=True)
    return ranked
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ fastapi>=0.100.0
2
+ uvicorn[standard]>=0.23.0
3
+ pydantic>=2.0.0
4
+ python-multipart>=0.0.6
5
+ pdfplumber>=0.10.0