lfoppiano committed on
Commit
667de45
·
verified ·
1 Parent(s): fef35ce

Upload folder using huggingface_hub

Browse files
document_qa/document_qa_engine.py CHANGED
@@ -34,7 +34,7 @@ class TextMerger:
34
 
35
  Args:
36
  model_name: A tiktoken model name (e.g. ``"gpt-4"``). When given,
37
- the tokenizer for that model is used.
38
  encoding_name: A tiktoken encoding name (default ``"gpt2"``).
39
  Ignored when *model_name* is provided.
40
  """
@@ -174,7 +174,7 @@ class DataStorage:
174
 
175
  Args:
176
  embedding_function: A LangChain-compatible ``Embeddings`` instance
177
- root_path: Optional directory for persisted embeddings.
178
  engine: The vector-store class to use.
179
 
180
  """
@@ -278,7 +278,7 @@ class DocumentQAEngine:
278
  Args:
279
  llm: A LangChain chat model (e.g. ``ChatOpenAI``).
280
  data_storage: A `DataStorage` instance for managing embeddings.
281
- grobid_url: URL of the GROBID server.
282
  memory: Optional ``ConversationBufferMemory`` for multi-turn context.
283
 
284
  """
@@ -297,7 +297,8 @@ class DocumentQAEngine:
297
  llm,
298
  data_storage: DataStorage,
299
  grobid_url=None,
300
- memory=None
 
301
  ):
302
 
303
  self.llm = llm
@@ -307,7 +308,7 @@ class DocumentQAEngine:
307
  self.data_storage = data_storage
308
 
309
  if grobid_url:
310
- self.grobid_processor = GrobidProcessor(grobid_url)
311
 
312
  def query_document(
313
  self,
@@ -317,7 +318,7 @@ class DocumentQAEngine:
317
  context_size=4,
318
  extraction_schema=None,
319
  verbose=False
320
- ) -> tuple[Any, str]:
321
  """Ask a question and get an LLM-generated answer.
322
 
323
  Retrieves the most relevant chunks from the vector store, feeds
@@ -354,7 +355,7 @@ class DocumentQAEngine:
354
 
355
  if output_parser:
356
  try:
357
- return self._parse_json(response, output_parser), response
358
  except Exception as oe:
359
  print("Failing to parse the response", oe)
360
  return None, response, coordinates
@@ -369,7 +370,7 @@ class DocumentQAEngine:
369
  else:
370
  return None, response, coordinates
371
 
372
- def query_storage(self, query: str, doc_id, context_size=4) -> tuple[List[Document], list]:
373
  """Retrieve relevant text passages without calling the LLM.
374
 
375
  Useful for debugging which chunks would be used as context, or for
@@ -480,7 +481,7 @@ class DocumentQAEngine:
480
 
481
  return parsed_output
482
 
483
- def _run_query(self, doc_id, query, context_size=4) -> tuple[List[Document], list]:
484
  relevant_documents, relevant_document_coordinates = self._get_context(doc_id, query, context_size)
485
  response = self.chain.invoke({"context": relevant_documents, "question": query})
486
  return response, relevant_document_coordinates
@@ -550,7 +551,7 @@ class DocumentQAEngine:
550
  biblio['filename'] = filename.replace(" ", "_")
551
 
552
  if verbose:
553
- print("Generating embeddings for:", hash, ", filename: ", filename)
554
 
555
  texts = []
556
  metadatas = []
 
34
 
35
  Args:
36
  model_name: A tiktoken model name (e.g. ``"gpt-4"``). When given,
37
+ the tokenizer for that model is used.
38
  encoding_name: A tiktoken encoding name (default ``"gpt2"``).
39
  Ignored when *model_name* is provided.
40
  """
 
174
 
175
  Args:
176
  embedding_function: A LangChain-compatible ``Embeddings`` instance
177
+ root_path: Optional directory for persisted embeddings.
178
  engine: The vector-store class to use.
179
 
180
  """
 
278
  Args:
279
  llm: A LangChain chat model (e.g. ``ChatOpenAI``).
280
  data_storage: A `DataStorage` instance for managing embeddings.
281
+ grobid_url: URL of the GROBID server.
282
  memory: Optional ``ConversationBufferMemory`` for multi-turn context.
283
 
284
  """
 
297
  llm,
298
  data_storage: DataStorage,
299
  grobid_url=None,
300
+ memory=None,
301
+ ping_grobid_server: bool = True
302
  ):
303
 
304
  self.llm = llm
 
308
  self.data_storage = data_storage
309
 
310
  if grobid_url:
311
+ self.grobid_processor = GrobidProcessor(grobid_url, ping_server=ping_grobid_server)
312
 
313
  def query_document(
314
  self,
 
318
  context_size=4,
319
  extraction_schema=None,
320
  verbose=False
321
+ ) -> tuple[Any, str, list]:
322
  """Ask a question and get an LLM-generated answer.
323
 
324
  Retrieves the most relevant chunks from the vector store, feeds
 
355
 
356
  if output_parser:
357
  try:
358
+ return self._parse_json(response, output_parser), response, coordinates
359
  except Exception as oe:
360
  print("Failing to parse the response", oe)
361
  return None, response, coordinates
 
370
  else:
371
  return None, response, coordinates
372
 
373
+ def query_storage(self, query: str, doc_id, context_size=4) -> tuple[List[str], list]:
374
  """Retrieve relevant text passages without calling the LLM.
375
 
376
  Useful for debugging which chunks would be used as context, or for
 
481
 
482
  return parsed_output
483
 
484
+ def _run_query(self, doc_id, query, context_size=4) -> tuple[Any, list]:
485
  relevant_documents, relevant_document_coordinates = self._get_context(doc_id, query, context_size)
486
  response = self.chain.invoke({"context": relevant_documents, "question": query})
487
  return response, relevant_document_coordinates
 
551
  biblio['filename'] = filename.replace(" ", "_")
552
 
553
  if verbose:
554
+ print("Generating embeddings for filename: ", filename)
555
 
556
  texts = []
557
  metadatas = []
document_qa/grobid_processors.py CHANGED
@@ -20,10 +20,19 @@ from pathlib import Path
20
 
21
  import dateparser
22
  import grobid_tei_xml
 
23
  from bs4 import BeautifulSoup
24
  from grobid_client.grobid_client import GrobidClient
25
 
26
 
 
 
 
 
 
 
 
 
27
  def get_span_start(type, title=None):
28
  """Return an opening ``<span>`` tag for an annotation of the given *type*."""
29
  title_ = ' title="' + title + '"' if title is not None else ""
@@ -168,22 +177,57 @@ class GrobidProcessor(BaseProcessor):
168
 
169
  Returns ``None`` if GROBID returns a non-200 status.
170
  """
171
- pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
172
- input_path,
173
- consolidate_header=True,
174
- consolidate_citations=False,
175
- segment_sentences=False,
176
- tei_coordinates=coordinates,
177
- include_raw_citations=False,
178
- include_raw_affiliations=False,
179
- generateIDs=True)
 
 
 
 
 
 
180
 
181
  if status != 200:
182
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
- document_object = self.parse_grobid_xml(text, coordinates=coordinates)
185
  document_object['filename'] = Path(pdf_file).stem.replace(".tei", "")
186
 
 
 
 
 
 
 
 
 
187
  return document_object
188
 
189
  def process_single(self, input_file):
@@ -221,7 +265,7 @@ class GrobidProcessor(BaseProcessor):
221
  try:
222
  year = dateparser.parse(doc_biblio.header.date).year
223
  biblio["publication_year"] = year
224
- except:
225
  pass
226
 
227
  output_data['biblio'] = biblio
 
20
 
21
  import dateparser
22
  import grobid_tei_xml
23
+ import requests
24
  from bs4 import BeautifulSoup
25
  from grobid_client.grobid_client import GrobidClient
26
 
27
 
28
+ class GrobidServiceError(RuntimeError):
29
+ """Raised when the Grobid service fails to process a document."""
30
+
31
+ def __init__(self, message="Grobid service error", status_code=None):
32
+ super().__init__(message)
33
+ self.status_code = status_code
34
+
35
+
36
  def get_span_start(type, title=None):
37
  """Return an opening ``<span>`` tag for an annotation of the given *type*."""
38
  title_ = ' title="' + title + '"' if title is not None else ""
 
177
 
178
  Raises ``GrobidServiceError`` if GROBID is unreachable, returns a non-200 status, or returns an empty, malformed or text-less response.
179
  """
180
+ try:
181
+ pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
182
+ input_path,
183
+ consolidate_header=True,
184
+ consolidate_citations=False,
185
+ segment_sentences=False,
186
+ tei_coordinates=coordinates,
187
+ include_raw_citations=False,
188
+ include_raw_affiliations=False,
189
+ generateIDs=True)
190
+ except requests.exceptions.RequestException as exc:
191
+ # Transport-level failure (connection refused, timeout, …).
192
+ # Local/usage errors (bad path, parsing bugs) are intentionally
193
+ # not caught here so they surface with their real traceback.
194
+ raise GrobidServiceError("Grobid service did not respond.") from exc
195
 
196
  if status != 200:
197
+ raise GrobidServiceError(
198
+ f"Grobid service returned status {status}.",
199
+ status_code=status
200
+ )
201
+
202
+ # Grobid can answer 200 with an empty body (e.g. it gave up on the PDF).
203
+ if not text or not text.strip():
204
+ raise GrobidServiceError(
205
+ "Grobid returned an empty response.",
206
+ status_code=status
207
+ )
208
+
209
+ # A truncated/corrupted TEI payload makes the XML parser blow up; map
210
+ # that to a clear service error instead of an opaque parsing traceback.
211
+ try:
212
+ document_object = self.parse_grobid_xml(text, coordinates=coordinates)
213
+ except GrobidServiceError:
214
+ raise
215
+ except Exception as exc:
216
+ raise GrobidServiceError(
217
+ "Grobid returned a malformed or truncated response.",
218
+ status_code=status
219
+ ) from exc
220
 
 
221
  document_object['filename'] = Path(pdf_file).stem.replace(".tei", "")
222
 
223
+ # Well-formed XML can still carry no usable text (e.g. an image-only or
224
+ # truncated PDF). Nothing to embed downstream, so fail loudly here.
225
+ if not any(passage.get('text', '').strip() for passage in document_object.get('passages', [])):
226
+ raise GrobidServiceError(
227
+ "Grobid returned a document with no extractable text.",
228
+ status_code=status
229
+ )
230
+
231
  return document_object
232
 
233
  def process_single(self, input_file):
 
265
  try:
266
  year = dateparser.parse(doc_biblio.header.date).year
267
  biblio["publication_year"] = year
268
+ except Exception:
269
  pass
270
 
271
  output_data['biblio'] = biblio
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  # Grobid
2
  grobid-quantities-client==0.4.0
3
- grobid-client-python==0.0.9
4
  grobid-tei-xml==0.1.3
5
 
6
  # Utils
@@ -30,6 +30,6 @@ typing-inspect==0.9.0
30
  typing_extensions==4.12.2
31
  pydantic==2.10.6
32
  sentence-transformers==2.6.1
33
- streamlit-pdf-viewer==0.0.25
34
  umap-learn==0.5.6
35
  plotly==5.20.0
 
1
  # Grobid
2
  grobid-quantities-client==0.4.0
3
+ grobid-client-python==0.1.4
4
  grobid-tei-xml==0.1.3
5
 
6
  # Utils
 
30
  typing_extensions==4.12.2
31
  pydantic==2.10.6
32
  sentence-transformers==2.6.1
33
+ streamlit-pdf-viewer==0.0.29
34
  umap-learn==0.5.6
35
  plotly==5.20.0
streamlit_app.py CHANGED
@@ -15,20 +15,19 @@ from hashlib import blake2b
15
  from tempfile import NamedTemporaryFile
16
 
17
  import dotenv
 
18
  from grobid_quantities.quantities import QuantitiesAPI
19
  from langchain.memory import ConversationBufferMemory
20
  from langchain_openai import ChatOpenAI
21
  from streamlit_pdf_viewer import pdf_viewer
22
 
23
  from document_qa.custom_embeddings import ModalEmbeddings
 
 
24
  from document_qa.ner_client_generic import NERClientGeneric
25
 
26
  dotenv.load_dotenv(override=True)
27
 
28
- import streamlit as st
29
- from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
30
- from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations
31
-
32
  API_MODELS = {
33
  "microsoft/Phi-4-mini-instruct": os.environ["PHI_URL"],
34
  "Qwen/Qwen3-0.6B": os.environ["QWEN_URL"]
@@ -169,7 +168,13 @@ def init_qa(model_name, embeddings_name):
169
  )
170
 
171
  storage = DataStorage(embeddings)
172
- return DocumentQAEngine(chat, storage, grobid_url=os.environ['GROBID_URL'], memory=st.session_state['memory'])
 
 
 
 
 
 
173
 
174
 
175
  @st.cache_resource
@@ -358,19 +363,36 @@ if uploaded_file and not st.session_state.loaded_embeddings:
358
  st.stop()
359
 
360
  with left_column:
361
- with st.spinner('Reading file, calling Grobid, and creating in-memory embeddings...'):
362
- binary = uploaded_file.getvalue()
363
- tmp_file = NamedTemporaryFile()
364
- tmp_file.write(bytearray(binary))
365
- st.session_state['binary'] = binary
366
-
367
- st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(
368
- tmp_file.name,
369
- chunk_size=chunk_size,
370
- perc_overlap=0.1
371
- )
372
- st.session_state['loaded_embeddings'] = True
373
- st.session_state.messages = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
374
 
375
 
376
  def rgb_to_hex(rgb):
 
15
  from tempfile import NamedTemporaryFile
16
 
17
  import dotenv
18
+ import streamlit as st
19
  from grobid_quantities.quantities import QuantitiesAPI
20
  from langchain.memory import ConversationBufferMemory
21
  from langchain_openai import ChatOpenAI
22
  from streamlit_pdf_viewer import pdf_viewer
23
 
24
  from document_qa.custom_embeddings import ModalEmbeddings
25
+ from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
26
+ from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations, GrobidServiceError
27
  from document_qa.ner_client_generic import NERClientGeneric
28
 
29
  dotenv.load_dotenv(override=True)
30
 
 
 
 
 
31
  API_MODELS = {
32
  "microsoft/Phi-4-mini-instruct": os.environ["PHI_URL"],
33
  "Qwen/Qwen3-0.6B": os.environ["QWEN_URL"]
 
168
  )
169
 
170
  storage = DataStorage(embeddings)
171
+ return DocumentQAEngine(
172
+ chat,
173
+ storage,
174
+ grobid_url=os.environ['GROBID_URL'],
175
+ memory=st.session_state['memory'],
176
+ ping_grobid_server=False
177
+ )
178
 
179
 
180
  @st.cache_resource
 
363
  st.stop()
364
 
365
  with left_column:
366
+ try:
367
+ with st.spinner('Reading file, calling Grobid, and creating in-memory embeddings...'):
368
+ binary = uploaded_file.getvalue()
369
+ tmp_path = None
370
+ try:
371
+ with NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
372
+ tmp_file.write(bytearray(binary))
373
+ tmp_file.flush()
374
+ tmp_path = tmp_file.name
375
+ st.session_state['binary'] = binary
376
+
377
+ st.session_state['doc_id'] = st.session_state['rqa'][model].create_memory_embeddings(
378
+ tmp_path,
379
+ chunk_size=chunk_size,
380
+ perc_overlap=0.1
381
+ )
382
+ finally:
383
+ if tmp_path and os.path.exists(tmp_path):
384
+ os.unlink(tmp_path)
385
+ st.session_state['loaded_embeddings'] = True
386
+ st.session_state.messages = []
387
+ except GrobidServiceError as exc:
388
+ st.session_state['doc_id'] = None
389
+ st.session_state['loaded_embeddings'] = False
390
+ st.session_state['uploaded'] = False
391
+ message = str(exc).strip() or "Grobid is not responding."
392
+ if not message.endswith((".", "!", "?")):
393
+ message += "."
394
+ st.error(f"{message} Please try again later.")
395
+ st.stop()
396
 
397
 
398
  def rgb_to_hex(rgb):