from haystack.document_stores import InMemoryDocumentStore

import gradio as gr
import pandas as pd

# --- Load and prepare the data --------------------------------------------
# df: one row per file; derive a cleaned, searchable label from the path.
df = pd.read_parquet('df.parquet')

dirname = 'lot3'
# Strip the leading ".../lot3/<bucket>/" prefix and the trailing file name,
# drop parentheses, then normalise separators and zero-padding so the path
# reads as plain words.
# NOTE: pandas >= 2.0 defaults str.replace to regex=False, so regex=True
# must be passed explicitly for the pattern-based replacements (the original
# code relied on the old regex=True default and would silently stop working).
df['fileclean'] = (
    df.file
    .str.replace(rf'.*{dirname}/[^/]+/', '', regex=True)
    .str.replace(r'[\(\)]', '', regex=True)
    .str.replace(r'/[^/]+$', '', regex=True)
    .str.replace('/', ' ', regex=False)
    .str.replace('-', ' ', regex=False)
    .str.replace(r' 0+', ' ', regex=True)
)

# candidats is joined to df2 on their shared columns; every df2 column is
# cast to str on BOTH sides first so the merge keys compare equal.
candidats = pd.read_parquet('candidats.parquet')
df2 = pd.read_parquet('df2.parquet')
for c in df2.columns:
    candidats[c] = candidats[c].astype(str)
    df2[c] = df2[c].astype(str)
candidats = candidats.merge(df2)
from haystack.nodes import BM25Retriever
from haystack.pipelines import DocumentSearchPipeline

# --- Build the BM25 search pipeline ---------------------------------------
# Index one document per distinct cleaned file name: the cleaned name becomes
# the searchable "content" field, every other column rides along as metadata.
document_store = InMemoryDocumentStore(use_bm25=True)
unique_files = df.drop_duplicates(subset=['fileclean'])
docs = unique_files.rename(columns={'fileclean': 'content'}).to_dict(orient='records')
document_store.write_documents(docs)

# Keyword (BM25) retrieval over the indexed file names.
retriever = BM25Retriever(document_store=document_store)
pipeline = DocumentSearchPipeline(retriever=retriever)
def semanticsearch(query):
    """Run a BM25 search for *query* and return the hits as a DataFrame.

    Each row holds one matched document's metadata plus its retrieval
    ``score``; at most the top 10 documents are returned.
    """
    response = pipeline.run(
        query=query,
        params={"Retriever": {"top_k": 10}},
        debug=False,
    )
    # Build a fresh dict per hit instead of mutating document.meta in place:
    # the original code wrote the score straight into each document's meta,
    # polluting the documents stored in the document store. It also rebound
    # the loop-external `result` variable mid-iteration, which was fragile.
    rows = [{**doc.meta, 'score': doc.score} for doc in response['documents']]
    return pd.DataFrame(rows)
# --- Gradio UI --------------------------------------------------------------
# A dropdown of candidate texts drives the search; results are shown as a table.
choices = candidats.sort_values(by='text').text.tolist()
demo = gr.Interface(
    fn=semanticsearch,
    inputs=[gr.Dropdown(choices)],
    outputs=[gr.Dataframe()],
)

if __name__ == "__main__":
    demo.launch()