"""PDF question-answering API: retrieval-augmented generation over local PDFs.

At import time this module loads the configured PDFs, splits them into
chunks, indexes the chunks in a FAISS vector store, wraps the retriever in a
cross-encoder reranker, and wires a FLAN-T5 generator into a RetrievalQA
chain. A FastAPI app then exposes the chain through ``POST /ask``.
"""

import os
from typing import List

from fastapi import FastAPI
from pydantic import BaseModel
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain_community.vectorstores import FAISS
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from transformers import pipeline
from huggingface_hub import login

# Authenticate with the Hugging Face Hub only when a token is provided, so a
# missing HF_TOKEN no longer aborts the import with a KeyError.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(hf_token)

# -----------------------------
# Prompt (forces concise output)
# -----------------------------
# NOTE(review): the template text is reproduced exactly as found; "Context",
# "Question" and "Concise Answer" share a single line -- confirm whether line
# breaks between those sections were intended.
QA_PROMPT = PromptTemplate(
    template=(
        "Answer the following question in a short and concise way "
        "(maximum 20 sentences), using only the information from the "
        "context below. If you don’t know the answer, just say "
        '"I cant assist you". \n'
        "Context: {context} Question: {question} Concise Answer:"
    ),
    input_variables=["context", "question"],
)

# -----------------------------
# Load and process documents
# -----------------------------
pdfs = ["ejemplo2.pdf"]

docs = []
for pdf in pdfs:
    docs.extend(PyPDFLoader(pdf).load())

# Overlapping chunks keep text that straddles a chunk boundary retrievable.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-MiniLM-L3-v2"
)
vectorstore = FAISS.from_documents(splits, embeddings)

# Two-stage retrieval: fetch 10 candidates from the vector store, then let the
# cross-encoder keep the 5 it scores highest.
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
cross_encoder = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base")
compressor = CrossEncoderReranker(model=cross_encoder, top_n=5)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)

# -----------------------------
# Configure FLAN-T5 (better task)
# -----------------------------
# NOTE(review): five 1000-character chunks plus the prompt may exceed the
# input length FLAN-T5 handles well -- confirm and lower top_n/chunk_size if so.
generator = pipeline(
    "text2text-generation",  # seq2seq task, required for T5-family models
    model="google/flan-t5-base",
    max_new_tokens=512,  # upper bound on the number of generated tokens
    # The previous `temperature=5` was never applied: sampling was not
    # enabled, so decoding was already greedy and the setting only produced a
    # warning. Greedy decoding is now requested explicitly.
    do_sample=False,
    repetition_penalty=1.1,
)
llm = HuggingFacePipeline(pipeline=generator)

# -----------------------------
# RetrievalQA
# -----------------------------
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=compression_retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_PROMPT},
)

# -----------------------------
# FastAPI app
# -----------------------------
app = FastAPI(title="PDF QA API", description="Query PDFs with RAG + HuggingFace")


class QueryRequest(BaseModel):
    """Request body for ``POST /ask``."""

    # Question to answer from the indexed PDFs.
    query: str


class QueryResponse(BaseModel):
    """Response body for ``POST /ask``."""

    # Text produced by the QA chain.
    answer: str
    # Distinct "source" metadata values of the documents used as context.
    sources: List[str]


@app.post("/ask", response_model=QueryResponse)
def ask_question(request: QueryRequest) -> QueryResponse:
    """Answer a question using the retrieval-augmented QA chain.

    Args:
        request: Body carrying the user's question in ``query``.

    Returns:
        The chain's answer together with the ``source`` metadata of the
        documents it was given, de-duplicated in first-seen order. Documents
        without a ``source`` entry are reported as ``"unknown"``.
    """
    result = qa_chain.invoke({"query": request.query})
    chunk_sources = [
        doc.metadata.get("source", "unknown") for doc in result["source_documents"]
    ]
    # Several retrieved chunks usually come from the same file; report each
    # source once while keeping the retrieval order.
    unique_sources = list(dict.fromkeys(chunk_sources))
    return QueryResponse(answer=result["result"], sources=unique_sources)