Spaces:

Samarth991
/

Summarize-PhotoDocument

Sleeping

App Files Community

Samarth991 committed on Mar 28, 2024

Commit

5f2768f

verified ·

1 Parent(s): e433400

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -1

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ from sentence_transformers import SentenceTransformer
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.prompts import PromptTemplate
 from langchain_community.llms.huggingface_hub import HuggingFaceHub
-from read_photodocument import convert_PDF_to_Text
 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
 import contextlib
@@ -61,6 +61,7 @@ def summarize_data(docs,llm_model,chain_type='refine'):
     prompt = PromptTemplate.from_template(prompt_template)
     refine_prompt = PromptTemplate.from_template(refine_template)
     chain = load_summarize_chain(llm=llm_model,
                             chain_type=chain_type,
@@ -76,6 +77,7 @@ def summarize_data(docs,llm_model,chain_type='refine'):
     consice_sumary = re.search("CONCISE SUMMARY:.*\.*$", output_text).group(0)
     dash_id = consice_sumary.find('-')
     return consice_sumary[:dash_id].replace('  ','\n')
     # matches = re.finditer(regex, output_text, re.DOTALL)
     # for matchNum, match in enumerate(matches, start=1):
     #     for groupNum in range(0, len(match.groups())):
@@ -115,6 +117,15 @@ def document_loader(temperature,max_tokens,api_key,model_name,file_path):
         was_truncated = conversion_stats["truncated"]
         print("Converted text {}\nNum Pages;{}".format(converted_txt,num_pages))
     if converted_txt:
         print("Document Processed ..")
         texts = process_documents(texts=converted_txt)

 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.prompts import PromptTemplate
 from langchain_community.llms.huggingface_hub import HuggingFaceHub
+from read_photodocument import convert_PDF_to_Text,convert_image_to_pdf
 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
 import contextlib
     prompt = PromptTemplate.from_template(prompt_template)
     refine_prompt = PromptTemplate.from_template(refine_template)
     chain = load_summarize_chain(llm=llm_model,
                             chain_type=chain_type,
     consice_sumary = re.search("CONCISE SUMMARY:.*\.*$", output_text).group(0)
     dash_id = consice_sumary.find('-')
     return consice_sumary[:dash_id].replace('  ','\n')
     # matches = re.finditer(regex, output_text, re.DOTALL)
     # for matchNum, match in enumerate(matches, start=1):
     #     for groupNum in range(0, len(match.groups())):
         was_truncated = conversion_stats["truncated"]
         print("Converted text {}\nNum Pages;{}".format(converted_txt,num_pages))
+    elif file_path.endswith('.jpg') or file_path.endswith('.jpeg'):
+        conversion_stats = convert_image_to_pdf(file_path,model)
+        converted_txt = conversion_stats["converted_text"]
+        num_pages = conversion_stats["num_pages"]
+        was_truncated = conversion_stats["truncated"]
+        print("Converted text {}\nNum Pages;{}".format(converted_txt,num_pages))
+    else:
+        return ("Invalid Format ....")
     if converted_txt:
         print("Document Processed ..")
         texts = process_documents(texts=converted_txt)