Spaces:
Sleeping
Sleeping
| """ | |
| src/doc_qa.py | |
| ============= | |
| Pipeline QA basado en el PDF de Northwind Traders. | |
| El texto completo del PDF se inyecta en el prompt del sistema. | |
| El documento es el reporte de Promociones y Beneficios FY2025. | |
| """ | |
| from pathlib import Path | |
| from langchain_core.output_parsers import StrOutputParser | |
| from langchain_core.prompts import ChatPromptTemplate | |
| from langchain_groq import ChatGroq | |
| from pypdf import PdfReader | |
# System-prompt template for the document QA chain. The {context} placeholder
# is replaced with the extracted PDF text in build_qa_chain().
# NOTE(review): the fallback sentence quoted in the instructions is in English
# while the prompt also requires answers in Spanish - confirm which is intended.
_SYSTEM_PROMPT = """\
You are the virtual assistant for Northwind Traders, a company specialized in
importing and exporting fine foods and specialty beverages.
You have access to the following internal HR document:
Employee Promotions & Benefits Program — FY2025
Document content:
======================
{context}
======================
Instructions:
- Answer ONLY based on information in the document above.
- If the information is not in the document, say clearly:
"This information is not available in the Northwind document."
- Always answer in Spanish, in a precise and professional tone.
- Quote specific numbers, dates, and names from the document when relevant.
"""
def load_pdf(pdf_path: str) -> str:
    """Extract the full text of a PDF file.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The text of every page, in page order, separated by blank lines.
        A page whose extraction yields nothing contributes an empty string.

    Raises:
        FileNotFoundError: If ``pdf_path`` is not an existing regular file
            (the path is missing, or it points to a directory).
    """
    path = Path(pdf_path)
    # is_file() instead of exists(): a directory "exists" but cannot be read
    # as a PDF, and would otherwise fail later with a less helpful error.
    if not path.is_file():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    reader = PdfReader(str(path))
    # `or ""` keeps the join safe when a page's extraction result is falsy.
    return "\n\n".join(page.extract_text() or "" for page in reader.pages)
def build_qa_chain(context: str, llm: ChatGroq):
    """Build the QA chain with the PDF text injected into the system prompt.

    Args:
        context: Full text of the PDF.
        llm: ChatGroq instance used to answer the question.

    Returns:
        A chain ready to be invoked with ``{"question": ...}``; the model
        output is passed through ``StrOutputParser``.
    """
    # Keep {context} as a template variable and bind it with partial()
    # instead of pre-formatting the string. Text substituted via str.format()
    # becomes part of the template itself, so any "{" or "}" inside the PDF
    # would be parsed as a placeholder and break prompt formatting.
    prompt = ChatPromptTemplate.from_messages([
        ("system", _SYSTEM_PROMPT),
        ("human", "{question}"),
    ]).partial(context=context)
    return prompt | llm | StrOutputParser()
def answer_from_doc(question: str, chain) -> str:
    """Run the QA chain for a single question and return its result.

    Args:
        question: The user's question.
        chain: Object exposing ``invoke``; called with ``{"question": ...}``.

    Returns:
        Whatever ``chain.invoke`` returns for the question.
    """
    payload = {"question": question}
    return chain.invoke(payload)