Mummia-99 committed on
Commit
826bb40
·
verified ·
1 Parent(s): 305672a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -6
app.py CHANGED
@@ -12,7 +12,23 @@ import os
12
  load_dotenv()
13
 
14
  # Configure Google Generative AI API
15
- genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  def get_response(model, user_input, image, prompt):
18
  """Generate response from the model using input and image data."""
@@ -39,7 +55,7 @@ if uploaded_pdf:
39
  with st.spinner("Converting PDF to images..."):
40
  images = convert_pdf_to_images(uploaded_pdf.read())
41
 
42
- st.image(images[0], caption="Page 1 of PDF", use_container_width=True)
43
 
44
  if st.button("Extract Table from Invoice"):
45
  with st.spinner("Extracting data with Gemini..."):
@@ -71,28 +87,42 @@ Format strictly as JSON array of dictionaries.
71
  """
72
 
73
  all_data = []
 
74
 
75
  try:
76
  for i, image in enumerate(images):
77
  response_text = get_response(model, prompt, image, prompt)
78
  st.success(f"✅ Gemini responded for page {i+1}!")
79
 
80
- # Parse JSON
81
  start_index = response_text.find('[')
82
  end_index = response_text.rfind(']') + 1
83
  clean_json = response_text[start_index:end_index]
84
  data = json.loads(clean_json)
 
85
  for row in data:
86
- row["Page"] = i + 1 # Add page number for traceability
 
 
 
 
87
  all_data.extend(data)
88
 
89
- if all_data:
 
 
 
 
 
 
90
  df = pd.DataFrame(all_data)
91
  if "CODE ARTICLE" in df.columns:
92
  df = df[df["CODE ARTICLE"].notna() & (df["CODE ARTICLE"] != "")]
93
  st.dataframe(df)
94
 
95
- # Create downloadable Excel
 
 
96
  output = io.BytesIO()
97
  with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
98
  df.to_excel(writer, index=False, sheet_name="Invoice Data")
 
12
  load_dotenv()
13
 
14
  # Configure Google Generative AI API
15
+ genai.configure(api_key=os.getenv("GOOGLE_API_KEY", "your-api-key-here"))
16
+
17
+ # File to track already processed invoices
18
+ PROCESSED_FILE = "processed_invoices.json"
19
+
20
+ def load_processed_invoices():
21
+ if os.path.exists(PROCESSED_FILE):
22
+ with open(PROCESSED_FILE, "r") as f:
23
+ return json.load(f)
24
+ return []
25
+
26
+ def save_processed_invoice(invoice_number):
27
+ processed = load_processed_invoices()
28
+ if invoice_number not in processed:
29
+ processed.append(invoice_number)
30
+ with open(PROCESSED_FILE, "w") as f:
31
+ json.dump(processed, f)
32
 
33
  def get_response(model, user_input, image, prompt):
34
  """Generate response from the model using input and image data."""
 
55
  with st.spinner("Converting PDF to images..."):
56
  images = convert_pdf_to_images(uploaded_pdf.read())
57
 
58
+ st.image(images[0], caption="Page 1 of PDF", use_column_width=True)
59
 
60
  if st.button("Extract Table from Invoice"):
61
  with st.spinner("Extracting data with Gemini..."):
 
87
  """
88
 
89
  all_data = []
90
+ invoice_numbers_found = set()
91
 
92
  try:
93
  for i, image in enumerate(images):
94
  response_text = get_response(model, prompt, image, prompt)
95
  st.success(f"✅ Gemini responded for page {i+1}!")
96
 
97
+ # Parse response text to extract JSON
98
  start_index = response_text.find('[')
99
  end_index = response_text.rfind(']') + 1
100
  clean_json = response_text[start_index:end_index]
101
  data = json.loads(clean_json)
102
+
103
  for row in data:
104
+ row["Page"] = i + 1
105
+ invoice_id = row.get("N° FACTURE") or row.get("Avoir")
106
+ if invoice_id:
107
+ invoice_numbers_found.add(invoice_id.strip())
108
+
109
  all_data.extend(data)
110
 
111
+ # Check if already processed
112
+ processed = load_processed_invoices()
113
+ duplicate_invoices = [inv for inv in invoice_numbers_found if inv in processed]
114
+
115
+ if duplicate_invoices:
116
+ st.warning(f"⚠️ Invoice(s) {', '.join(duplicate_invoices)} have already been analyzed. Skipping processing.")
117
+ elif all_data:
118
  df = pd.DataFrame(all_data)
119
  if "CODE ARTICLE" in df.columns:
120
  df = df[df["CODE ARTICLE"].notna() & (df["CODE ARTICLE"] != "")]
121
  st.dataframe(df)
122
 
123
+ for inv in invoice_numbers_found:
124
+ save_processed_invoice(inv)
125
+
126
  output = io.BytesIO()
127
  with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
128
  df.to_excel(writer, index=False, sheet_name="Invoice Data")