Spaces:
Build error
Build error
Upload folder using huggingface_hub
Browse files
- document_qa/document_qa_engine.py +11 -10
- document_qa/grobid_processors.py +56 -12
- requirements.txt +2 -2
- streamlit_app.py +40 -18
document_qa/document_qa_engine.py
CHANGED
|
@@ -34,7 +34,7 @@ class TextMerger:
|
|
| 34 |
|
| 35 |
Args:
|
| 36 |
model_name: A tiktoken model name (e.g. ``"gpt-4"``). When given,
|
| 37 |
-
the tokenizer for that model is used.
|
| 38 |
encoding_name: A tiktoken encoding name (default ``"gpt2"``).
|
| 39 |
Ignored when *model_name* is provided.
|
| 40 |
"""
|
|
@@ -174,7 +174,7 @@ class DataStorage:
|
|
| 174 |
|
| 175 |
Args:
|
| 176 |
embedding_function: A LangChain-compatible ``Embeddings`` instance
|
| 177 |
-
root_path: Optional directory for persisted embeddings.
|
| 178 |
engine: The vector-store class to use.
|
| 179 |
|
| 180 |
"""
|
|
@@ -278,7 +278,7 @@ class DocumentQAEngine:
|
|
| 278 |
Args:
|
| 279 |
llm: A LangChain chat model (e.g. ``ChatOpenAI``).
|
| 280 |
data_storage: A `DataStorage` instance for managing embeddings.
|
| 281 |
-
grobid_url: URL of the GROBID server.
|
| 282 |
memory: Optional ``ConversationBufferMemory`` for multi-turn context.
|
| 283 |
|
| 284 |
"""
|
|
@@ -297,7 +297,8 @@ class DocumentQAEngine:
|
|
| 297 |
llm,
|
| 298 |
data_storage: DataStorage,
|
| 299 |
grobid_url=None,
|
| 300 |
-
memory=None
|
|
|
|
| 301 |
):
|
| 302 |
|
| 303 |
self.llm = llm
|
|
@@ -307,7 +308,7 @@ class DocumentQAEngine:
|
|
| 307 |
self.data_storage = data_storage
|
| 308 |
|
| 309 |
if grobid_url:
|
| 310 |
-
self.grobid_processor = GrobidProcessor(grobid_url)
|
| 311 |
|
| 312 |
def query_document(
|
| 313 |
self,
|
|
@@ -317,7 +318,7 @@ class DocumentQAEngine:
|
|
| 317 |
context_size=4,
|
| 318 |
extraction_schema=None,
|
| 319 |
verbose=False
|
| 320 |
-
) -> tuple[Any, str]:
|
| 321 |
"""Ask a question and get an LLM-generated answer.
|
| 322 |
|
| 323 |
Retrieves the most relevant chunks from the vector store, feeds
|
|
@@ -354,7 +355,7 @@ class DocumentQAEngine:
|
|
| 354 |
|
| 355 |
if output_parser:
|
| 356 |
try:
|
| 357 |
-
return self._parse_json(response, output_parser), response
|
| 358 |
except Exception as oe:
|
| 359 |
print("Failing to parse the response", oe)
|
| 360 |
return None, response, coordinates
|
|
@@ -369,7 +370,7 @@ class DocumentQAEngine:
|
|
| 369 |
else:
|
| 370 |
return None, response, coordinates
|
| 371 |
|
| 372 |
-
def query_storage(self, query: str, doc_id, context_size=4) -> tuple[List[
|
| 373 |
"""Retrieve relevant text passages without calling the LLM.
|
| 374 |
|
| 375 |
Useful for debugging which chunks would be used as context, or for
|
|
@@ -480,7 +481,7 @@ class DocumentQAEngine:
|
|
| 480 |
|
| 481 |
return parsed_output
|
| 482 |
|
| 483 |
-
def _run_query(self, doc_id, query, context_size=4) -> tuple[
|
| 484 |
relevant_documents, relevant_document_coordinates = self._get_context(doc_id, query, context_size)
|
| 485 |
response = self.chain.invoke({"context": relevant_documents, "question": query})
|
| 486 |
return response, relevant_document_coordinates
|
|
@@ -550,7 +551,7 @@ class DocumentQAEngine:
|
|
| 550 |
biblio['filename'] = filename.replace(" ", "_")
|
| 551 |
|
| 552 |
if verbose:
|
| 553 |
-
print("Generating embeddings for
|
| 554 |
|
| 555 |
texts = []
|
| 556 |
metadatas = []
|
|
|
|
| 34 |
|
| 35 |
Args:
|
| 36 |
model_name: A tiktoken model name (e.g. ``"gpt-4"``). When given,
|
| 37 |
+
the tokenizer for that model is used.
|
| 38 |
encoding_name: A tiktoken encoding name (default ``"gpt2"``).
|
| 39 |
Ignored when *model_name* is provided.
|
| 40 |
"""
|
|
|
|
| 174 |
|
| 175 |
Args:
|
| 176 |
embedding_function: A LangChain-compatible ``Embeddings`` instance
|
| 177 |
+
root_path: Optional directory for persisted embeddings.
|
| 178 |
engine: The vector-store class to use.
|
| 179 |
|
| 180 |
"""
|
|
|
|
| 278 |
Args:
|
| 279 |
llm: A LangChain chat model (e.g. ``ChatOpenAI``).
|
| 280 |
data_storage: A `DataStorage` instance for managing embeddings.
|
| 281 |
+
grobid_url: URL of the GROBID server.
|
| 282 |
memory: Optional ``ConversationBufferMemory`` for multi-turn context.
|
| 283 |
|
| 284 |
"""
|
|
|
|
| 297 |
llm,
|
| 298 |
data_storage: DataStorage,
|
| 299 |
grobid_url=None,
|
| 300 |
+
memory=None,
|
| 301 |
+
ping_grobid_server: bool = True
|
| 302 |
):
|
| 303 |
|
| 304 |
self.llm = llm
|
|
|
|
| 308 |
self.data_storage = data_storage
|
| 309 |
|
| 310 |
if grobid_url:
|
| 311 |
+
self.grobid_processor = GrobidProcessor(grobid_url, ping_server=ping_grobid_server)
|
| 312 |
|
| 313 |
def query_document(
|
| 314 |
self,
|
|
|
|
| 318 |
context_size=4,
|
| 319 |
extraction_schema=None,
|
| 320 |
verbose=False
|
| 321 |
+
) -> tuple[Any, str, list]:
|
| 322 |
"""Ask a question and get an LLM-generated answer.
|
| 323 |
|
| 324 |
Retrieves the most relevant chunks from the vector store, feeds
|
|
|
|
| 355 |
|
| 356 |
if output_parser:
|
| 357 |
try:
|
| 358 |
+
return self._parse_json(response, output_parser), response, coordinates
|
| 359 |
except Exception as oe:
|
| 360 |
print("Failing to parse the response", oe)
|
| 361 |
return None, response, coordinates
|
|
|
|
| 370 |
else:
|
| 371 |
return None, response, coordinates
|
| 372 |
|
| 373 |
+
def query_storage(self, query: str, doc_id, context_size=4) -> tuple[List[str], list]:
|
| 374 |
"""Retrieve relevant text passages without calling the LLM.
|
| 375 |
|
| 376 |
Useful for debugging which chunks would be used as context, or for
|
|
|
|
| 481 |
|
| 482 |
return parsed_output
|
| 483 |
|
| 484 |
+
def _run_query(self, doc_id, query, context_size=4) -> tuple[Any, list]:
|
| 485 |
relevant_documents, relevant_document_coordinates = self._get_context(doc_id, query, context_size)
|
| 486 |
response = self.chain.invoke({"context": relevant_documents, "question": query})
|
| 487 |
return response, relevant_document_coordinates
|
|
|
|
| 551 |
biblio['filename'] = filename.replace(" ", "_")
|
| 552 |
|
| 553 |
if verbose:
|
| 554 |
+
print("Generating embeddings for filename: ", filename)
|
| 555 |
|
| 556 |
texts = []
|
| 557 |
metadatas = []
|
document_qa/grobid_processors.py
CHANGED
|
@@ -20,10 +20,19 @@ from pathlib import Path
|
|
| 20 |
|
| 21 |
import dateparser
|
| 22 |
import grobid_tei_xml
|
|
|
|
| 23 |
from bs4 import BeautifulSoup
|
| 24 |
from grobid_client.grobid_client import GrobidClient
|
| 25 |
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
def get_span_start(type, title=None):
|
| 28 |
"""Return an opening ``<span>`` tag for an annotation of the given *type*."""
|
| 29 |
title_ = ' title="' + title + '"' if title is not None else ""
|
|
@@ -168,22 +177,57 @@ class GrobidProcessor(BaseProcessor):
|
|
| 168 |
|
| 169 |
Returns ``None`` if GROBID returns a non-200 status.
|
| 170 |
"""
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
|
| 181 |
if status != 200:
|
| 182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
-
document_object = self.parse_grobid_xml(text, coordinates=coordinates)
|
| 185 |
document_object['filename'] = Path(pdf_file).stem.replace(".tei", "")
|
| 186 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
return document_object
|
| 188 |
|
| 189 |
def process_single(self, input_file):
|
|
@@ -221,7 +265,7 @@ class GrobidProcessor(BaseProcessor):
|
|
| 221 |
try:
|
| 222 |
year = dateparser.parse(doc_biblio.header.date).year
|
| 223 |
biblio["publication_year"] = year
|
| 224 |
-
except:
|
| 225 |
pass
|
| 226 |
|
| 227 |
output_data['biblio'] = biblio
|
|
|
|
| 20 |
|
| 21 |
import dateparser
|
| 22 |
import grobid_tei_xml
|
| 23 |
+
import requests
|
| 24 |
from bs4 import BeautifulSoup
|
| 25 |
from grobid_client.grobid_client import GrobidClient
|
| 26 |
|
| 27 |
|
| 28 |
+
class GrobidServiceError(RuntimeError):
|
| 29 |
+
"""Raised when the Grobid service fails to process a document."""
|
| 30 |
+
|
| 31 |
+
def __init__(self, message="Grobid service error", status_code=None):
|
| 32 |
+
super().__init__(message)
|
| 33 |
+
self.status_code = status_code
|
| 34 |
+
|
| 35 |
+
|
| 36 |
def get_span_start(type, title=None):
|
| 37 |
"""Return an opening ``<span>`` tag for an annotation of the given *type*."""
|
| 38 |
title_ = ' title="' + title + '"' if title is not None else ""
|
|
|
|
| 177 |
|
| 178 |
Raises ``GrobidServiceError`` if GROBID cannot be reached, returns a non-200 status, or returns an empty, malformed, or text-free response.
|
| 179 |
"""
|
| 180 |
+
try:
|
| 181 |
+
pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
|
| 182 |
+
input_path,
|
| 183 |
+
consolidate_header=True,
|
| 184 |
+
consolidate_citations=False,
|
| 185 |
+
segment_sentences=False,
|
| 186 |
+
tei_coordinates=coordinates,
|
| 187 |
+
include_raw_citations=False,
|
| 188 |
+
include_raw_affiliations=False,
|
| 189 |
+
generateIDs=True)
|
| 190 |
+
except requests.exceptions.RequestException as exc:
|
| 191 |
+
# Transport-level failure (connection refused, timeout, …).
|
| 192 |
+
# Local/usage errors (bad path, parsing bugs) are intentionally
|
| 193 |
+
# not caught here so they surface with their real traceback.
|
| 194 |
+
raise GrobidServiceError("Grobid service did not respond.") from exc
|
| 195 |
|
| 196 |
if status != 200:
|
| 197 |
+
raise GrobidServiceError(
|
| 198 |
+
f"Grobid service returned status {status}.",
|
| 199 |
+
status_code=status
|
| 200 |
+
)
|
| 201 |
+
|
| 202 |
+
# Grobid can answer 200 with an empty body (e.g. it gave up on the PDF).
|
| 203 |
+
if not text or not text.strip():
|
| 204 |
+
raise GrobidServiceError(
|
| 205 |
+
"Grobid returned an empty response.",
|
| 206 |
+
status_code=status
|
| 207 |
+
)
|
| 208 |
+
|
| 209 |
+
# A truncated/corrupted TEI payload makes the XML parser blow up; map
|
| 210 |
+
# that to a clear service error instead of an opaque parsing traceback.
|
| 211 |
+
try:
|
| 212 |
+
document_object = self.parse_grobid_xml(text, coordinates=coordinates)
|
| 213 |
+
except GrobidServiceError:
|
| 214 |
+
raise
|
| 215 |
+
except Exception as exc:
|
| 216 |
+
raise GrobidServiceError(
|
| 217 |
+
"Grobid returned a malformed or truncated response.",
|
| 218 |
+
status_code=status
|
| 219 |
+
) from exc
|
| 220 |
|
|
|
|
| 221 |
document_object['filename'] = Path(pdf_file).stem.replace(".tei", "")
|
| 222 |
|
| 223 |
+
# Well-formed XML can still carry no usable text (e.g. an image-only or
|
| 224 |
+
# truncated PDF). Nothing to embed downstream, so fail loudly here.
|
| 225 |
+
if not any(passage.get('text', '').strip() for passage in document_object.get('passages', [])):
|
| 226 |
+
raise GrobidServiceError(
|
| 227 |
+
"Grobid returned a document with no extractable text.",
|
| 228 |
+
status_code=status
|
| 229 |
+
)
|
| 230 |
+
|
| 231 |
return document_object
|
| 232 |
|
| 233 |
def process_single(self, input_file):
|
|
|
|
| 265 |
try:
|
| 266 |
year = dateparser.parse(doc_biblio.header.date).year
|
| 267 |
biblio["publication_year"] = year
|
| 268 |
+
except Exception:
|
| 269 |
pass
|
| 270 |
|
| 271 |
output_data['biblio'] = biblio
|
requirements.txt
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
# Grobid
|
| 2 |
grobid-quantities-client==0.4.0
|
| 3 |
-
grobid-client-python==0.
|
| 4 |
grobid-tei-xml==0.1.3
|
| 5 |
|
| 6 |
# Utils
|
|
@@ -30,6 +30,6 @@ typing-inspect==0.9.0
|
|
| 30 |
typing_extensions==4.12.2
|
| 31 |
pydantic==2.10.6
|
| 32 |
sentence-transformers==2.6.1
|
| 33 |
-
streamlit-pdf-viewer==0.0.
|
| 34 |
umap-learn==0.5.6
|
| 35 |
plotly==5.20.0
|
|
|
|
| 1 |
# Grobid
|
| 2 |
grobid-quantities-client==0.4.0
|
| 3 |
+
grobid-client-python==0.1.4
|
| 4 |
grobid-tei-xml==0.1.3
|
| 5 |
|
| 6 |
# Utils
|
|
|
|
| 30 |
typing_extensions==4.12.2
|
| 31 |
pydantic==2.10.6
|
| 32 |
sentence-transformers==2.6.1
|
| 33 |
+
streamlit-pdf-viewer==0.0.29
|
| 34 |
umap-learn==0.5.6
|
| 35 |
plotly==5.20.0
|
streamlit_app.py
CHANGED
|
@@ -15,20 +15,19 @@ from hashlib import blake2b
|
|
| 15 |
from tempfile import NamedTemporaryFile
|
| 16 |
|
| 17 |
import dotenv
|
|
|
|
| 18 |
from grobid_quantities.quantities import QuantitiesAPI
|
| 19 |
from langchain.memory import ConversationBufferMemory
|
| 20 |
from langchain_openai import ChatOpenAI
|
| 21 |
from streamlit_pdf_viewer import pdf_viewer
|
| 22 |
|
| 23 |
from document_qa.custom_embeddings import ModalEmbeddings
|
|
|
|
|
|
|
| 24 |
from document_qa.ner_client_generic import NERClientGeneric
|
| 25 |
|
| 26 |
dotenv.load_dotenv(override=True)
|
| 27 |
|
| 28 |
-
import streamlit as st
|
| 29 |
-
from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
|
| 30 |
-
from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations
|
| 31 |
-
|
| 32 |
API_MODELS = {
|
| 33 |
"microsoft/Phi-4-mini-instruct": os.environ["PHI_URL"],
|
| 34 |
"Qwen/Qwen3-0.6B": os.environ["QWEN_URL"]
|
|
@@ -169,7 +168,13 @@ def init_qa(model_name, embeddings_name):
|
|
| 169 |
)
|
| 170 |
|
| 171 |
storage = DataStorage(embeddings)
|
| 172 |
-
return DocumentQAEngine(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
|
| 174 |
|
| 175 |
@st.cache_resource
|
|
@@ -358,19 +363,36 @@ if uploaded_file and not st.session_state.loaded_embeddings:
|
|
| 358 |
st.stop()
|
| 359 |
|
| 360 |
with left_column:
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
|
| 375 |
|
| 376 |
def rgb_to_hex(rgb):
|
|
|
|
| 15 |
from tempfile import NamedTemporaryFile
|
| 16 |
|
| 17 |
import dotenv
|
| 18 |
+
import streamlit as st
|
| 19 |
from grobid_quantities.quantities import QuantitiesAPI
|
| 20 |
from langchain.memory import ConversationBufferMemory
|
| 21 |
from langchain_openai import ChatOpenAI
|
| 22 |
from streamlit_pdf_viewer import pdf_viewer
|
| 23 |
|
| 24 |
from document_qa.custom_embeddings import ModalEmbeddings
|
| 25 |
+
from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
|
| 26 |
+
from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations, GrobidServiceError
|
| 27 |
from document_qa.ner_client_generic import NERClientGeneric
|
| 28 |
|
| 29 |
dotenv.load_dotenv(override=True)
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
API_MODELS = {
|
| 32 |
"microsoft/Phi-4-mini-instruct": os.environ["PHI_URL"],
|
| 33 |
"Qwen/Qwen3-0.6B": os.environ["QWEN_URL"]
|
|
|
|
| 168 |
)
|
| 169 |
|
| 170 |
storage = DataStorage(embeddings)
|
| 171 |
+
return DocumentQAEngine(
|
| 172 |
+
chat,
|
| 173 |
+
storage,
|
| 174 |
+
grobid_url=os.environ['GROBID_URL'],
|
| 175 |
+
memory=st.session_state['memory'],
|
| 176 |
+
ping_grobid_server=False
|
| 177 |
+
)
|
| 178 |
|
| 179 |
|
| 180 |
@st.cache_resource
|
|
|
|
| 363 |
st.stop()
|
| 364 |
|
| 365 |
with left_column:
|
| 366 |
+
try:
|
| 367 |
+
with st.spinner('Reading file, calling Grobid, and creating in-memory embeddings...'):
|
| 368 |
+
binary = uploaded_file.getvalue()
|
| 369 |
+
tmp_path = None
|
| 370 |
+
try:
|
| 371 |
+
with NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
|
| 372 |
+
tmp_file.write(bytearray(binary))
|
| 373 |
+
tmp_file.flush()
|
| 374 |
+
tmp_path = tmp_file.name
|
| 375 |
+
st.session_state['binary'] = binary
|
| 376 |
+
|
| 377 |
+
st.session_state['doc_id'] = st.session_state['rqa'][model].create_memory_embeddings(
|
| 378 |
+
tmp_path,
|
| 379 |
+
chunk_size=chunk_size,
|
| 380 |
+
perc_overlap=0.1
|
| 381 |
+
)
|
| 382 |
+
finally:
|
| 383 |
+
if tmp_path and os.path.exists(tmp_path):
|
| 384 |
+
os.unlink(tmp_path)
|
| 385 |
+
st.session_state['loaded_embeddings'] = True
|
| 386 |
+
st.session_state.messages = []
|
| 387 |
+
except GrobidServiceError as exc:
|
| 388 |
+
st.session_state['doc_id'] = None
|
| 389 |
+
st.session_state['loaded_embeddings'] = False
|
| 390 |
+
st.session_state['uploaded'] = False
|
| 391 |
+
message = str(exc).strip() or "Grobid is not responding."
|
| 392 |
+
if not message.endswith((".", "!", "?")):
|
| 393 |
+
message += "."
|
| 394 |
+
st.error(f"{message} Please try again later.")
|
| 395 |
+
st.stop()
|
| 396 |
|
| 397 |
|
| 398 |
def rgb_to_hex(rgb):
|