Spaces:

intoxication
/

wbrule

Configuration error

App Files Files Community

wbrule / haystack-api /pipelines_biobert.haystack-pipeline.yml

intoxication

Upload 12 files

3fe47db over 2 years ago

raw

history blame contribute delete

2.97 kB

	# NOTE(review): presumably tells Haystack to skip its version compatibility
	# check for this file - confirm against the Haystack release in use.
	version: ignore

	# Every node that the pipelines section can reference by name.
	components:
	  # Elasticsearch-backed store on localhost; both retrievers below point at it.
	  - name: DocumentStore
	    type: ElasticsearchDocumentStore
	    params:
	      host: localhost

	  # Selects the most relevant documents from the document store and passes
	  # them on to the Joiner (see the query pipeline).
	  # Uses a Transformer model to encode the document and the query.
	  - name: Retriever
	    type: EmbeddingRetriever
	    params:
	      document_store: DocumentStore
	      # Alternative: multi-qa-MiniLM-L6-dot-v1
	      embedding_model: sentence-transformers/multi-qa-mpnet-base-dot-v1
	      embed_meta_fields:
	        - filename
	      top_k: 10  # The number of results to return

	  # Second retriever (BM25) over the same document store.
	  - name: BM25
	    type: BM25Retriever
	    params:
	      document_store: DocumentStore
	      top_k: 10

	  # Merges the result lists of both retrievers by reciprocal rank fusion.
	  - name: Joiner
	    type: JoinDocuments
	    params:
	      join_mode: reciprocal_rank_fusion

	  # The component that actually fetches answers from among the joined
	  # documents: up to 20, i.e. 10 from each of the two retrievers.
	  # Transformer-based reader, specializes in extractive QA.
	  - name: Reader
	    type: FARMReader
	    params:
	      # Alternative: dmis-lab/biobert-base-cased-v1.1-squad
	      model_name_or_path: dmis-lab/biobert-large-cased-v1.1-squad
	      context_window_size: 700  # The size of the window around the answer span

	  # Routes files based on their extension to appropriate converters,
	  # by default txt, pdf, md, docx, html.
	  - name: FileTypeClassifier
	    type: FileTypeClassifier

	  # Converts files into documents.
	  - name: TextConverter
	    type: TextConverter

	  # Converts PDFs into documents.
	  - name: PDFConverter
	    type: PDFToTextConverter

	  # Splits documents into smaller ones and cleans them up.
	  - name: Preprocessor
	    type: PreProcessor
	    params:
	      # With a vector-based retriever, it's good to split your documents
	      # into smaller ones.
	      split_by: word  # The unit by which you want to split the documents
	      split_length: 250  # The max number of words in a document
	      split_overlap: 20  # Enables the sliding window approach
	      # Retains complete sentences in split documents.
	      split_respect_sentence_boundary: true
	      # Used by NLTK to best detect the sentence boundaries for that language.
	      language: en


	# Here you define how the nodes are organized in the pipelines.
	# For each node, specify its inputs; every node name must match a component
	# declared in the components section.
	pipelines:
	  # Question answering: both retrievers run on the query, the Joiner merges
	  # their results, and the Reader extracts answers from the merged list.
	  - name: query
	    nodes:
	      - name: Retriever
	        inputs: [Query]
	      - name: BM25
	        inputs: [Query]
	      - name: Joiner
	        inputs: [Retriever, BM25]
	      - name: Reader
	        inputs: [Joiner]

	  # Ingestion: convert files, split them, then write them to the store.
	  - name: indexing
	    nodes:
	      # Depending on the file type, we use a Text or PDF converter.
	      - name: FileTypeClassifier
	        inputs: [File]
	      # Ensures this converter receives TXT files.
	      - name: TextConverter
	        inputs: [FileTypeClassifier.output_1]
	      # Ensures this converter receives PDFs.
	      - name: PDFConverter
	        inputs: [FileTypeClassifier.output_2]
	      - name: Preprocessor
	        inputs: [TextConverter, PDFConverter]
	      # NOTE(review): presumably the Retriever node computes embeddings for
	      # the split documents before they are written - confirm.
	      - name: Retriever
	        inputs: [Preprocessor]
	      - name: DocumentStore
	        inputs: [Retriever]