Spaces:
Sleeping
Sleeping
| import os | |
| import gradio as gr | |
| import re | |
| from langchain.embeddings.base import Embeddings | |
| from typing import List | |
| from sentence_transformers import SentenceTransformer | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from langchain.prompts import PromptTemplate | |
| from langchain_community.llms.huggingface_hub import HuggingFaceHub | |
| from read_photodocument import convert_PDF_to_Text,convert_image_to_pdf | |
| from doctr.io import DocumentFile | |
| from doctr.models import ocr_predictor | |
| import contextlib | |
| from langchain.schema import Document | |
| from langchain.text_splitter import CharacterTextSplitter | |
| from langchain.chains.summarize import load_summarize_chain | |
| import logging | |
# Root logger: INFO level, records formatted as "<date time> <level> <message>".
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S",
    level=logging.INFO,
)

# Module-level configuration constants.
DEVICE = 'cpu'
FILE_EXT = ['pdf', 'jpg', 'jpeg']
DEFAULT_SYSTEM_PROMPT = "As an intelligent AI your task is to extract text from the pdf containing image and create a summary and higlight vital point within it ."
MAX_NEW_TOKENS = 2048
DEFAULT_TEMPERATURE = 0.1
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = 2048  # also the upper bound of the "max new tokens" slider

# Sentence-embedding model, loaded once at import time.
embedding_modelPath = 'multi-qa-mpnet-base-dot-v1'  # "sentence-transformers/all-MiniLM-l6-v2"
local_embeddings = HuggingFaceEmbeddings(
    model_name=embedding_modelPath,
    model_kwargs={'device': DEVICE},
    encode_kwargs={'normalize_embeddings': False},
)

# Build the OCR predictor once; stdout is redirected to None so anything the
# constructor prints is discarded.
with contextlib.redirect_stdout(None):
    ocr_model = ocr_predictor(
        "db_resnet50",
        "crnn_mobilenet_v3_large",
        pretrained=True,
        assume_straight_pages=True,
    )
def loading_file():
    """Return the fixed status string ``"Loading..."``."""
    status_text = "Loading..."
    return status_text
def _extract_concise_summary(output_text):
    """Post-process raw chain output into the text returned to the caller.

    Newlines are flattened to spaces, everything before the first
    ``CONCISE SUMMARY:`` marker is dropped, and spaces are then turned back
    into newlines.

    Args:
        output_text: Raw ``output_text`` string produced by the summarize chain.

    Returns:
        The processed summary. When the marker is absent the whole output is
        used instead of raising.
    """
    marker = "CONCISE SUMMARY:"
    flattened = output_text.replace('\n', ' ')
    marker_at = flattened.find(marker)
    # The marker only appears when the model echoes the prompt back; without
    # this fallback a missing marker crashed with AttributeError on None.
    summary = flattened[marker_at:] if marker_at != -1 else flattened
    # NOTE(review): replacing every single space puts each word on its own
    # line; this mirrors the existing behaviour -- confirm whether a
    # double-space separator was intended.
    return summary.replace(' ', '\n')


def summarize_data(docs, llm_model, chain_type='refine'):
    """Summarize ``docs`` with a LangChain summarize chain.

    Args:
        docs: Documents passed to the chain as ``input_documents``.
        llm_model: LLM handed to ``load_summarize_chain``.
        chain_type: Chain type forwarded to ``load_summarize_chain``. The
            chain is configured through ``question_prompt``; presumably only
            the ``'refine'`` type accepts that keyword -- verify before
            passing another value.

    Returns:
        The chain's ``output_text`` after ``_extract_concise_summary``.
    """
    prompt_template = """
Write a concise summary of the following text pointwise without repeating sentences:
{text}
CONCISE SUMMARY:
"""
    prompt = PromptTemplate.from_template(prompt_template)
    # No custom refine prompt is supplied, so the chain falls back to the
    # library's default refine prompt.
    chain = load_summarize_chain(
        llm=llm_model,
        chain_type=chain_type,
        question_prompt=prompt,
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
    )
    summary = chain({"input_documents": docs}, return_only_outputs=True)
    return _extract_concise_summary(summary["output_text"])
def process_documents(texts, data_chunk=1000, chunk_overlap=10):
    """Split raw text into chunks and wrap each chunk in a ``Document``.

    Args:
        texts: The text to split.
        data_chunk: ``chunk_size`` given to the splitter.
        chunk_overlap: ``chunk_overlap`` given to the splitter.

    Returns:
        A list of ``Document`` objects, one per chunk produced by
        ``CharacterTextSplitter.split_text``.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=data_chunk,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return [Document(page_content=chunk) for chunk in splitter.split_text(texts)]
def get_hugging_face_model(model_id='tiiuae/falcon-7b-instruct', temperature=0.01, max_tokens=4096, API_key=None):
    """Build a ``HuggingFaceHub`` LLM client.

    Args:
        model_id: Passed as ``repo_id``.
        temperature: Forwarded in ``model_kwargs`` as ``temperature``.
        max_tokens: Forwarded in ``model_kwargs`` as ``max_new_tokens``.
        API_key: Passed as ``huggingfacehub_api_token``.

    Returns:
        The constructed ``HuggingFaceHub`` instance.
    """
    generation_kwargs = {"temperature": temperature, "max_new_tokens": max_tokens}
    return HuggingFaceHub(
        repo_id=model_id,
        huggingfacehub_api_token=API_key,
        model_kwargs=generation_kwargs,
    )
def _uploaded_file_path(file_obj):
    """Return the path string for an uploaded file, or ``None`` if unavailable.

    Accepts either a plain path string or an object exposing the path as
    ``.name`` (Gradio's file input can deliver either form depending on its
    version/configuration -- see the Gradio ``File`` documentation).
    """
    if file_obj is None:
        return None
    path = getattr(file_obj, "name", file_obj)
    if isinstance(path, str) and path:
        return path
    return None


def document_loader(temperature, max_tokens, api_key, model_name, file_path):
    """OCR an uploaded PDF/JPEG and return an LLM summary of its text.

    Args:
        temperature: Sampling temperature forwarded to the LLM client.
        max_tokens: ``max_new_tokens`` forwarded to the LLM client.
        api_key: Hugging Face Hub API token.
        model_name: Hub repo id of the LLM.
        file_path: Uploaded file, as a path string or an object with ``.name``.

    Returns:
        The summary text on success, otherwise a short message: no file
        supplied, unsupported extension, or empty OCR result.
    """
    path = _uploaded_file_path(file_path)
    if path is None:
        return "No file uploaded ...."

    # Validate the extension (case-insensitively) before building the model
    # client, so bad input is rejected without touching the Hub.
    lowered = path.lower()
    if lowered.endswith('.pdf'):
        is_pdf = True
    elif lowered.endswith(('.jpg', '.jpeg')):
        is_pdf = False
    else:
        return ("Invalid Format ....")

    model = get_hugging_face_model(model_id=model_name, API_key=api_key, temperature=temperature, max_tokens=max_tokens)

    if is_pdf:
        conversion_stats = convert_PDF_to_Text(PDF_file=path, ocr_model=ocr_model)
    else:
        conversion_stats = convert_image_to_pdf(path, ocr_model)

    converted_txt = conversion_stats["converted_text"]
    num_pages = conversion_stats["num_pages"]
    was_truncated = conversion_stats["truncated"]
    # Log sizes rather than the full text so document contents stay out of logs.
    logging.info("Converted %s page(s), %d character(s)", num_pages, len(converted_txt or ""))
    if was_truncated:
        logging.warning("Conversion reported truncated output; the summary may be incomplete")

    if not converted_txt:
        return "Error in Processsing document "

    logging.info("Document Processed ..")
    docs = process_documents(texts=converted_txt)
    return summarize_data(docs=docs, llm_model=model)
# Gradio UI: the inputs are listed in the same order as the parameters of
# document_loader (temperature, max_tokens, api_key, model_name, file_path).
iface = gr.Interface(
    fn=document_loader,
    inputs=[
        gr.Slider(0.01, 0.1, value=0.01, step=0.01, label="temperature", info="Choose between 0.01 to 0.1"),
        gr.Slider(512, MAX_INPUT_TOKEN_LENGTH, value=1024, step=512, label="max new tokens", info='Max new tokens'),
        gr.Textbox(label="Add API key", type="password"),
        gr.Dropdown(
            ['tiiuae/falcon-7b-instruct', 'mistralai/Mistral-7B-v0.1'],
            # Explicit default so a submit without a selection never passes
            # None as the model id.
            value='tiiuae/falcon-7b-instruct',
            label='Large Language Model',
            info='LLM Service',
        ),
        "file",
    ],
    outputs="text",
    description="Summarize your PDF Document having Image • HuggingFace",
)
iface.launch()