# Hugging Face Space: FastAPI service for PDF summarization, question answering,
# and FAISS-based semantic chunk search.
| # app.py | |
| from fastapi import FastAPI, UploadFile, File | |
| from pydantic import BaseModel | |
| from typing import List | |
| import fitz # PyMuPDF | |
| from transformers import pipeline | |
| from sentence_transformers import SentenceTransformer | |
| from langchain.vectorstores import FAISS | |
| from langchain_community.embeddings import HuggingFaceBgeEmbeddings | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain.schema import Document | |
| from langchain.chains.question_answering import load_qa_chain | |
| from langchain.llms import HuggingFacePipeline | |
| from langchain_core.documents import Document as LangchainDocument | |
# --- Init FastAPI ---
app = FastAPI()

# --- Summarizer: abstractive summarization pipeline (facebook/bart-large-cnn) ---
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# --- Question Answering: extractive QA pipeline (deepset/roberta-base-squad2) ---
qa_pipe = pipeline("question-answering", model="deepset/roberta-base-squad2")

# --- Embedding model: BGE small English embeddings, used to build the FAISS index ---
embedding_model = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en-v1.5")

# --- Text Splitter: 800-character chunks with 100-character overlap ---
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
| # --- Pydantic schemas --- | |
class Summary(BaseModel):
    """Response fragment holding the full document summary text."""
    summary: str
class KeyPoint(BaseModel):
    """A single key point — one sentence extracted from the summary."""
    point: str
class DocumentAnalysis(BaseModel):
    """Complete analysis result: overall summary plus its key points."""
    summary: Summary
    key_points: List[KeyPoint]
class QARequest(BaseModel):
    """Request body for question answering: a question and the context to search."""
    question: str
    context: str
class QAResponse(BaseModel):
    """Response body carrying the extracted answer string."""
    answer: str
| # --- PDF Text Extractor --- | |
def extract_text_from_pdf(pdf_file: UploadFile) -> str:
    """Extract the plain text of every page of an uploaded PDF.

    Args:
        pdf_file: Uploaded file; its underlying stream is read fully into memory.

    Returns:
        The concatenated text of all pages, in page order.
    """
    # Build the result with a single join instead of `+=` in a loop,
    # avoiding quadratic string concatenation on large documents.
    with fitz.open(stream=pdf_file.file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)
| # --- Analyze Text (summarization) --- | |
def analyze_text_structured(text: str) -> DocumentAnalysis:
    """Summarize *text* chunk-by-chunk and derive per-sentence key points.

    The text is split into overlapping chunks, each chunk is summarized
    independently, and the chunk summaries are joined into one summary.
    Key points are the individual sentences of that joined summary.
    """
    summaries = [
        out[0]["summary_text"]
        for piece in text_splitter.split_text(text)
        if (out := summarizer(piece, max_length=200, min_length=50, do_sample=False))
    ]
    full_summary = " ".join(summaries)
    # One key point per non-empty sentence of the combined summary.
    sentences = [s.strip() for s in full_summary.split(". ") if s.strip()]
    return DocumentAnalysis(
        summary=Summary(summary=full_summary),
        key_points=[KeyPoint(point=s) for s in sentences],
    )
| # --- Question Answering --- | |
def answer_question(question: str, context: str) -> str:
    """Run extractive QA over *context* and return the best answer span."""
    return qa_pipe(question=question, context=context)["answer"]
| # --- PDF Upload + Analysis Route --- | |
@app.post("/analyze", response_model=DocumentAnalysis)
async def analyze_pdf(file: UploadFile = File(...)):
    """Upload a PDF and receive its summary plus extracted key points.

    Fix: the handler was never registered with the app — the "Route" comment
    promised an endpoint but no ``@app.post`` decorator was present, so the
    function was unreachable over HTTP.
    """
    text = extract_text_from_pdf(file)
    return analyze_text_structured(text)
| # --- Question Answering Route --- | |
@app.post("/ask", response_model=QAResponse)
async def ask_question(qa_request: QARequest):
    """Answer ``qa_request.question`` using the supplied context.

    Fix: registered the handler as a route — it previously had no
    ``@app.post`` decorator and was unreachable over HTTP.
    """
    answer = answer_question(qa_request.question, qa_request.context)
    return QAResponse(answer=answer)
| # --- Embedding Search (FAISS) Demo --- | |
@app.post("/search")
async def search_chunks(file: UploadFile = File(...), query: str = ""):
    """Upload a PDF and return the 3 chunks most similar to *query*.

    Fix: registered the handler as a route — it previously had no
    ``@app.post`` decorator and was unreachable over HTTP.
    """
    text = extract_text_from_pdf(file)
    chunks = text_splitter.split_text(text)
    documents = [LangchainDocument(page_content=chunk) for chunk in chunks]
    # Build an in-memory FAISS index over the chunk embeddings for this request.
    db = FAISS.from_documents(documents, embedding_model)
    results = db.similarity_search(query, k=3)
    return {"results": [doc.page_content for doc in results]}