Sahil Garg committed on
Commit
cf1a853
·
1 Parent(s): d7eb7ae

User-given BOQ implemented

Browse files
Files changed (2) hide show
  1. services/boq_extractor.py +28 -8
  2. streamlit_app.py +20 -3
services/boq_extractor.py CHANGED
@@ -115,7 +115,7 @@ class BOQExtractor:
115
 
116
  return '|'.join(parts[:9])
117
 
118
- def _extract_from_batch(self, batch_text: str, batch_num: int, previous_boq: Optional[str] = None) -> List[str]:
119
  """
120
  Extract BOQ items from a single batch.
121
 
@@ -123,15 +123,30 @@ class BOQExtractor:
123
  batch_text: Combined text from batch chunks.
124
  batch_num: Batch number for logging.
125
  previous_boq: Previous BOQ output for improvement (optional).
 
 
126
 
127
  Returns:
128
  List of BOQ item strings.
129
  """
130
  prompt_text = batch_text[:self.max_prompt_length]
 
 
 
 
 
 
 
131
  if previous_boq:
132
- prompt = BOQ_IMPROVEMENT_TEMPLATE.format(previous_boq=previous_boq, batch_text=prompt_text)
 
 
 
133
  else:
134
- prompt = BOQ_EXTRACTION_TEMPLATE.format(batch_text=prompt_text)
 
 
 
135
 
136
  try:
137
  logger.info(f'Invoking LLM for BOQ extraction on batch {batch_num}...')
@@ -268,7 +283,7 @@ No BOQ items were found in this document.'''
268
 
269
  return formatted_boq
270
 
271
- def extract(self, chunks: List[Document], vector_store: FAISS = None, previous_boq: Optional[str] = None) -> str:
272
  """
273
  Extract BOQ from document chunks.
274
 
@@ -276,6 +291,8 @@ No BOQ items were found in this document.'''
276
  chunks: List of Document chunks.
277
  vector_store: Optional vector store (not used currently).
278
  previous_boq: Previous BOQ output for improvement (optional).
 
 
279
 
280
  Returns:
281
  Formatted BOQ output as markdown string.
@@ -301,7 +318,7 @@ No BOQ items were found in this document.'''
301
  batch_text = '\n\n'.join(chunk_texts)
302
  logger.info(f'Batch text length: {len(batch_text)}')
303
 
304
- batch_items = self._extract_from_batch(batch_text, batch_num, previous_boq)
305
  boq_items.extend(batch_items)
306
  logger.info(f'Batch {batch_num} yielded {len(batch_items)} items')
307
 
@@ -320,7 +337,7 @@ No BOQ items were found in this document.'''
320
  logger.error(f'Error in comprehensive BOQ extraction: {e}')
321
  raise
322
 
323
- def extract_iterative(self, chunks: List[Document], vector_store: FAISS, runs: int) -> Tuple[str, List[str]]:
324
  """
325
  Extract BOQ iteratively, improving with each run.
326
 
@@ -328,10 +345,13 @@ No BOQ items were found in this document.'''
328
  chunks: Document chunks.
329
  vector_store: Vector store.
330
  runs: Number of runs (1-5).
 
 
331
 
332
  Returns:
333
  Tuple of (final_boq, list_of_all_outputs).
334
  """
 
335
  all_outputs = []
336
  previous_boq = None
337
 
@@ -339,7 +359,7 @@ No BOQ items were found in this document.'''
339
  if runs == 1:
340
  logger.info('Starting single BOQ extraction (runs=1)')
341
  try:
342
- current_output = self.extract(chunks, vector_store, previous_boq)
343
  logger.info('Single extraction completed')
344
  except Exception as e:
345
  logger.error(f'Single extraction failed: {e}')
@@ -351,7 +371,7 @@ No BOQ items were found in this document.'''
351
  for run in range(runs):
352
  try:
353
  logger.info(f'Starting iterative run {run + 1}/{runs}')
354
- current_output = self.extract(chunks, vector_store, previous_boq)
355
  logger.info(f'Iterative run {run + 1} completed')
356
  except Exception as e:
357
  logger.warning(f'Iterative run {run + 1} failed: {e}, using previous output')
 
115
 
116
  return '|'.join(parts[:9])
117
 
118
+ def _extract_from_batch(self, batch_text: str, batch_num: int, previous_boq: Optional[str] = None, boq_mode: list = None, specific_boq: str = None) -> List[str]:
119
  """
120
  Extract BOQ items from a single batch.
121
 
 
123
  batch_text: Combined text from batch chunks.
124
  batch_num: Batch number for logging.
125
  previous_boq: Previous BOQ output for improvement (optional).
126
+ boq_mode: List of modes ["default", "specific BOQ"].
127
+ specific_boq: Specific BOQ string if mode includes "specific BOQ".
128
 
129
  Returns:
130
  List of BOQ item strings.
131
  """
132
  prompt_text = batch_text[:self.max_prompt_length]
133
+
134
+ # Determine extraction instruction based on mode
135
+ if boq_mode == ["specific BOQ"] and specific_boq:
136
+ extraction_instruction = f"Extract only BOQ items that are related to or match the following: {specific_boq}. If no matching items are found, return NO_BOQ_ITEMS."
137
+ else:
138
+ extraction_instruction = "Extract all BOQ items present in the text."
139
+
140
  if previous_boq:
141
+ base_prompt = BOQ_IMPROVEMENT_TEMPLATE
142
+ # Insert instruction before "Now, analyze this text"
143
+ base_prompt = base_prompt.replace("Now, analyze this text and extract improved BOQ line items.", f"{extraction_instruction}\n\nNow, analyze this text and extract improved BOQ line items.")
144
+ prompt = base_prompt.format(previous_boq=previous_boq, batch_text=prompt_text)
145
  else:
146
+ base_prompt = BOQ_EXTRACTION_TEMPLATE
147
+ # Insert instruction before "Text to analyze:"
148
+ base_prompt = base_prompt.replace("Text to analyze:", f"{extraction_instruction}\n\nText to analyze:")
149
+ prompt = base_prompt.format(batch_text=prompt_text)
150
 
151
  try:
152
  logger.info(f'Invoking LLM for BOQ extraction on batch {batch_num}...')
 
283
 
284
  return formatted_boq
285
 
286
+ def extract(self, chunks: List[Document], vector_store: FAISS = None, previous_boq: Optional[str] = None, boq_mode: list = None, specific_boq: str = None) -> str:
287
  """
288
  Extract BOQ from document chunks.
289
 
 
291
  chunks: List of Document chunks.
292
  vector_store: Optional vector store (not used currently).
293
  previous_boq: Previous BOQ output for improvement (optional).
294
+ boq_mode: List of modes ["default", "specific BOQ"].
295
+ specific_boq: Specific BOQ string if mode includes "specific BOQ".
296
 
297
  Returns:
298
  Formatted BOQ output as markdown string.
 
318
  batch_text = '\n\n'.join(chunk_texts)
319
  logger.info(f'Batch text length: {len(batch_text)}')
320
 
321
+ batch_items = self._extract_from_batch(batch_text, batch_num, previous_boq, boq_mode, specific_boq)
322
  boq_items.extend(batch_items)
323
  logger.info(f'Batch {batch_num} yielded {len(batch_items)} items')
324
 
 
337
  logger.error(f'Error in comprehensive BOQ extraction: {e}')
338
  raise
339
 
340
+ def extract_iterative(self, chunks: List[Document], vector_store: FAISS, runs: int, boq_mode: list = None, specific_boq: str = None) -> Tuple[str, List[str]]:
341
  """
342
  Extract BOQ iteratively, improving with each run.
343
 
 
345
  chunks: Document chunks.
346
  vector_store: Vector store.
347
  runs: Number of runs (1-5).
348
+ boq_mode: List of modes ["default", "specific BOQ"].
349
+ specific_boq: Specific BOQ string if mode includes "specific BOQ".
350
 
351
  Returns:
352
  Tuple of (final_boq, list_of_all_outputs).
353
  """
354
+ logger.info(f'BOQ extraction mode: {boq_mode}, Specific BOQ: {specific_boq}')
355
  all_outputs = []
356
  previous_boq = None
357
 
 
359
  if runs == 1:
360
  logger.info('Starting single BOQ extraction (runs=1)')
361
  try:
362
+ current_output = self.extract(chunks, vector_store, previous_boq, boq_mode, specific_boq)
363
  logger.info('Single extraction completed')
364
  except Exception as e:
365
  logger.error(f'Single extraction failed: {e}')
 
371
  for run in range(runs):
372
  try:
373
  logger.info(f'Starting iterative run {run + 1}/{runs}')
374
+ current_output = self.extract(chunks, vector_store, previous_boq, boq_mode, specific_boq)
375
  logger.info(f'Iterative run {run + 1} completed')
376
  except Exception as e:
377
  logger.warning(f'Iterative run {run + 1} failed: {e}, using previous output')
streamlit_app.py CHANGED
@@ -63,13 +63,15 @@ def initialize_session_state():
63
  st.session_state[key] = value
64
 
65
 
66
- def process_pdf(uploaded_file, runs: int) -> bool:
67
  """
68
  Process uploaded PDF file.
69
 
70
  Args:
71
  uploaded_file: Streamlit uploaded file
72
  runs: Number of extraction runs
 
 
73
 
74
  Returns:
75
  True if processing succeeded, False otherwise.
@@ -97,7 +99,7 @@ def process_pdf(uploaded_file, runs: int) -> bool:
97
 
98
  # Extract BOQ iteratively
99
  st.info(f"Extracting BOQ items ({runs} runs)...")
100
- final_boq, all_outputs = st.session_state.boq_extractor.extract_iterative(chunks, vector_store, runs)
101
 
102
  # Compute consistency
103
  consistency = st.session_state.consistency_checker.check_from_outputs(all_outputs)
@@ -290,9 +292,24 @@ def render_sidebar():
290
  help="Choose extraction quality. Higher runs improve accuracy but take longer to process."
291
  )
292
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
  if uploaded_file and api_key:
294
  if st.button("🚀 Process Document"):
295
- process_pdf(uploaded_file, runs)
296
  elif uploaded_file and not api_key:
297
  st.error("Please enter API key first.")
298
 
 
63
  st.session_state[key] = value
64
 
65
 
66
+ def process_pdf(uploaded_file, runs: int, boq_mode: list, specific_boq: str) -> bool:
67
  """
68
  Process uploaded PDF file.
69
 
70
  Args:
71
  uploaded_file: Streamlit uploaded file
72
  runs: Number of extraction runs
73
+ boq_mode: List of BOQ modes ["default", "specific BOQ"]
74
+ specific_boq: Specific BOQ string if applicable
75
 
76
  Returns:
77
  True if processing succeeded, False otherwise.
 
99
 
100
  # Extract BOQ iteratively
101
  st.info(f"Extracting BOQ items ({runs} runs)...")
102
+ final_boq, all_outputs = st.session_state.boq_extractor.extract_iterative(chunks, vector_store, runs, boq_mode, specific_boq)
103
 
104
  # Compute consistency
105
  consistency = st.session_state.consistency_checker.check_from_outputs(all_outputs)
 
292
  help="Choose extraction quality. Higher runs improve accuracy but take longer to process."
293
  )
294
 
295
+ # BOQ Mode selection
296
+ boq_mode = st.multiselect(
297
+ "BOQ Extraction Mode",
298
+ options=["default", "specific BOQ"],
299
+ default=["default"],
300
+ help="Select 'default' for all BOQ items, 'specific BOQ' to extract only a particular BOQ item."
301
+ )
302
+
303
+ specific_boq = None
304
+ if "specific BOQ" in boq_mode:
305
+ specific_boq = st.text_input(
306
+ "Specific BOQ",
307
+ help="Enter the name or description of the specific BOQ item to extract."
308
+ )
309
+
310
  if uploaded_file and api_key:
311
  if st.button("🚀 Process Document"):
312
+ process_pdf(uploaded_file, runs, boq_mode, specific_boq)
313
  elif uploaded_file and not api_key:
314
  st.error("Please enter API key first.")
315