OCR JupyterBook¶

Welcome to the OCR (Optical Character Recognition) JupyterBook! This comprehensive guide will walk you through the fascinating world of character recognition and translation, specifically focusing on transliterating text from cuneiform scripts into modern-day English using OCR tools like Tesseract through Python.

What You'll Learn¶

In this JupyterBook, you'll embark on a journey to understand the intricate process of optical character recognition and how it can be harnessed to unlock ancient scripts and languages. We'll explore step-by-step how to use Python and powerful OCR libraries like Tesseract to perform these transformations.

How to Get Started¶

To get started, simply run the setup cells below in Google Colaboratory. They mount your Google Drive in the Colab runtime, allowing you to execute the code and follow along with the examples.

Why This Book?¶

Cuneiform scripts, some of the earliest known systems of writing, have held the secrets of ancient civilizations for centuries. By learning how to transliterate and translate these scripts using modern OCR technology, you'll gain the ability to uncover and understand the rich history they contain.

What You'll Find Inside¶

Our OCR JupyterBook is structured to provide a smooth learning experience:

  1. Introduction to OCR: We'll start with the basics of OCR, explaining what it is and why it's important. You'll gain a solid understanding of the underlying concepts.

  2. Setting Up Your Environment: We'll guide you through the process of setting up the necessary tools and libraries, ensuring you have everything you need to dive into OCR.

  3. Tesseract Walkthrough: In this section, we'll explain how Tesseract works and walk through some examples.

  4. OCR Demonstration: We'll show how OCR is used through some basic examples.

  5. Advanced OCR: Delve into advanced topics, including optimizing OCR processes, handling complex scripts, and overcoming common challenges.

  6. Transliteration Techniques: Learn how to perform transliteration, the art of converting cuneiform characters into text. Explore different techniques and strategies.

  7. Statistics on Our Models: Apply your newfound knowledge to compute statistics on our models.

  8. Optional Modules: Optional functions that you can implement once you have a good grasp of the prior sections.

Who Should Read This Book¶

This JupyterBook is designed for anyone curious about the world of OCR, from beginners who want to grasp the basics to advanced users seeking to tackle complex transliteration and translation tasks. Whether you're an archaeologist, historian, linguist, or simply an enthusiast eager to decode ancient texts, you'll find valuable insights here.

Let's Begin!¶

We invite you to embark on this exciting journey into the world of OCR and character recognition. Whether you're here for academic, professional, or personal reasons, the knowledge and skills you'll gain are sure to be rewarding. So, let's get started by running the setup cell and exploring the fascinating realm of OCR!

I. Introduction¶

Note: If you do not have any prior knowledge of what OCR is and how it works, that is completely fine. At each step, we will walk you through what the program does.

Side Note: It is advisable to learn the basics of Python and its libraries from online sources like CodeCamp (YouTube) or by taking a week-long course on Coursera.

So, without wasting any time, let's dive into importing the necessary libraries for the initial part of the project.

Note: The green lines in the code below are comments explaining what the code does; deleting them does not affect the code. They are written to help you understand each step.

In [ ]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
In [ ]:
import pandas as pd #pandas library
import os #operating system
import json #json query
import csv #working with CSV
In [ ]:
# We are authenticating the user through their Google account.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
In [ ]:
from google.colab import drive
drive.mount('/content/src')

A. Working With CSV¶

  1. In the provided code below, the file_directory variable is first populated with a list of filenames found in the directory.
  2. Then, for each file in the directory that has a '.txt' file extension, the code extracts the file's title (filename) and its content (the text within the file) and stores them in separate lists, 'Title' and 'Content'.
  3. Finally, these lists are used to create a Pandas DataFrame/Table called 'your_csv' with two columns, 'Title' and 'Content', which can be used for further analysis or data manipulation.
In [ ]:
file_directory = os.listdir('src_link') #Replace 'src_link' with your directory path
#Columns for the table
Title = []
Content = []
for file in file_directory:
  if file.endswith('.txt'):
    Title.append(file)
    with open(os.path.join('src_link', file), 'r') as f: ##You still need to replace the path here
      Content.append(f.read())

your_csv = pd.DataFrame({'Title':Title, 'Content':Content}) #Building a table
your_csv #printing it
In [ ]:
#Saving your CSV file
your_csv.to_csv('name.csv', index=False) #Change it with the directory you want to save

B. Logging Our Tasks¶

We are building a logger that records messages at the INFO level, i.e., important events. The code iterates through files and folders (parents) from Drive, uses query parameters to fetch data, and works with the logger throughout to debug and track the whole run.

This code below configures a logging system in Python. This logger is used to record log messages in your Python code with the specified configuration.

Note: If you want to read the code below and don't know where to start, finish understanding the first block, then jump to the last block of code and work backward from there.

In [ ]:
import logging

# It sets the logging level to INFO, defines a specific log message format including a timestamp and
#thread name, and creates a logger object named 'logger' with the same INFO logging level.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(threadName)-10s %(message)s',)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
In [ ]:
#This function is designed to catalog and write information about files in a directory tree into a CSV
#file at the path given below (replace "/link" with your output path, e.g., 'catalog.csv').
def catalog_walk(top):
  csvoptions = {'dialect': csv.excel}

  OUTFILE = open("/link", "a")
  csvwriter = csv.writer(OUTFILE)
  maxfiles = 1000000 #Upper bound on the number of files processed; replace the number as needed
  big_loop(top, csvwriter, maxfiles)
  OUTFILE.close()
In [ ]:
#This function iterates through files and directories under the specified top-level directory,extracts metadata
#for each file and directory, and writes this metadata to a CSV file using the provided csvwriter.
#It stops processing and returns when the total count of processed files exceeds the maxfiles.

def big_loop(top, csvwriter, maxfiles):
  fc = 0
  #snapshot_file is defined globally in the setup cell at the end of this subsection
  for path, root, dirs, files in walk(top=top, by_name=False, snapshot_file=snapshot_file):

      dirpath = '/'.join(path)
      id, _, _, mimeType, createdTime, modifiedTime = extract_metadata(root)
      md5Checksum = len(dirs)
      size = len(files)
      row = [id, dirpath] #directory rows record only the ID and path
      csvwriter.writerow(row)
      fc = fc + 1
      for f in files:
          if fc >= maxfiles:
              logger.info('quitting: {} files found'.format(fc))
              return
          fc = fc + 1
          id, md5Checksum, size, mimeType, createdTime, modifiedTime = extract_metadata(f)
          row = [id, md5Checksum, size, mimeType, createdTime, modifiedTime, dirpath, f['name']]
          csvwriter.writerow(row)
In [ ]:
#This function recursively walks through a directory and retrieves information about directories and files,
#such as their names, IDs, and MIME types, and organizes them into a hierarchical structure.
#The function uses a stack-based approach to traverse the directory tree, allowing  it to handle nested directories
#effectively. It also includes functionality to optionally save and load a snapshot of the traversal state to
#resume processing later.

#Note: this function assumes a Google Drive API client object, `service` (e.g., built with
#googleapiclient.discovery.build('drive', 'v3')), and the FOLDER MIME type defined in the setup cell below.
def walk(top='root', by_name=False, snapshot_file='stack_snapshot.json'):
    try:
        with open(snapshot_file) as f:
            stack = json.loads(f.read())
    except FileNotFoundError:
        logger.info('no stack snapshot found')
        stack = None
    if not stack:
        if by_name:
            top, = iterfiles(name=top, is_folder=True)
        else:
            top = service.files().get(fileId=top).execute()
            if top['mimeType'] != FOLDER:
                raise ValueError('not a folder: %r' % top)
        stack = [([top['name']], top)]
    while stack:
        logger.info('stack size {}'.format(len(stack)))
        with open(snapshot_file, 'w') as file:
            file.write(json.dumps(stack))
        path, top = stack.pop()
        logger.info('dir {}'.format(path))
        dirs, files = is_file = [], [] #is_file[False] aliases dirs, is_file[True] aliases files
        for f in iterfiles(parent=top['id']):
            is_file[f['mimeType'] != FOLDER].append(f)
        logger.info('subdirs {} files {}'.format(len(dirs), len(files)))
        yield path, top, dirs, files
        if dirs:
            logger.debug('dirs {}'.format(dirs))
            logger.debug('path {}'.format(path))
            newstuff = [(path + [d['name']], d) for d in reversed(dirs)]
            logger.debug('extend: {}'.format(newstuff))
            stack.extend(newstuff)
In [ ]:
#iterfiles uses the Google Drive API to search for files and folders based on specific criteria
#such as name, folder status, and parent folder. It retrieves information about these files and folders,
#such as their IDs, names, and MIME types, and yields each file or folder one at a time,
#allowing convenient iteration over search results. In short, this is an efficient function
#for searching a directory.

def iterfiles(name=None, is_folder=None, parent=None, order_by='folder,name,createdTime'):
    q = []
    if name is not None:
        q.append("name = '%s'" % name.replace("'", "\\'"))
    if is_folder is not None:
        q.append("mimeType %s '%s'" % ('=' if is_folder else '!=', FOLDER))
    if parent is not None:
        q.append("'%s' in parents" % parent.replace("'", "\\'"))
    params = {'pageToken': None, 'orderBy': order_by,
              'fields': 'kind, nextPageToken, incompleteSearch, files(id, name, mimeType, md5Checksum, webViewLink, createdTime, modifiedTime, size)'}
    if q:
        params['q'] = ' and '.join(q)
    while True:
        logger.debug('params {}'.format(params))
        response = service.files().list(**params).execute()
        logger.debug('response {}'.format(response))
        for f in response['files']:
            yield f
        try:
            params['pageToken'] = response['nextPageToken']
        except KeyError:
            return
In [ ]:
#This Python function takes an object called `node` as input. It extracts metadata attributes such as
#ID, MD5 checksum, size, MIME type, and timestamps from the `node` object and returns them as a tuple.
#If any attribute is missing, it is set to the string 'None'.
def extract_metadata(node):
    md5Checksum = node.get('md5Checksum', 'None')
    size = node.get('size', 'None')
    mimeType = node.get('mimeType', 'None')
    id = node.get('id', 'None')
    createdTime = node.get('createdTime', 'None')
    modifiedTime = node.get('modifiedTime', 'None')
    return id, md5Checksum, size, mimeType, createdTime, modifiedTime
In [ ]:
# The Drive folder you want to catalog (a folder ID when walking via the API)
top = 'directory_you_want_to_store'
snapshot_file = '/text_file_name.txt' # Used by walk() to save and resume traversal state
FOLDER = 'application/vnd.google-apps.folder'
catalog_walk(top)

II. Tesseract Walkthrough¶

Description: This section serves as a comprehensive guide for newcomers to Tesseract OCR (Optical Character Recognition).

Inside this book, you'll find:

  1. Introduction to Tesseract: An overview of what Tesseract OCR is, its capabilities, and why it's essential in various applications.

  2. Installation Guide: Step-by-step instructions to set up Tesseract on your system.

  3. Basic Usage: A simple introduction to using Tesseract to extract text from images or scanned documents.

  4. Advanced Features: Exploring Tesseract's advanced features, including language support, image preprocessing, and optimizing accuracy.

By the end of this JupyterBook, you'll have a solid understanding of how to set up and effectively use Tesseract OCR, making it accessible to beginners and enabling you to harness the power of text recognition for your projects and applications.

In [ ]:
import pytesseract
from pytesseract import TesseractError
from gcld3 import NNetLanguageIdentifier
from PIL import Image, ImageDraw
import lang #this project's language-helper package (provides langcodes and detect, used below)
import lang.langcodes
import lang.detect
import os
import fitz #PyMuPDF
import pandas as pd
from io import StringIO
import re
import csv
import time
import warnings
import requests
import langdetect
In [ ]:
#To check which languages pytesseract recognizes
pytesseract.get_languages()

url = 'https://r12a.github.io/scripts/arabic/images/kashida-justification.png'

# Use your own image URL in the `url` variable to load a different image
src = Image.open(requests.get(url, stream=True).raw)
src
In [ ]:
#These functions below are commonly used in evaluating the performance of text recognition or classification systems.
#`precision` calculates the proportion of characters in the 'output' that also appear in the 'truth.'
def precision(truth, output):
  return len([c for c in output if c in truth]) / len(output)
#`recall` calculates the proportion of characters in the 'truth' that also appear in the 'output.'
def recall(truth, output):
  return len([c for c in truth if c in output]) / len(truth)

In [ ]:
%%timeit
#Timing the OCR call (note: %%timeit must be the first line of its own cell)
correct = pytesseract.image_to_string(src, lang='ara')
correct
In [ ]:
two_lang_out = pytesseract.image_to_string(src, lang='eng+ara')
correct = pytesseract.image_to_string(src, lang='ara')
precision(correct, two_lang_out), recall(correct, two_lang_out)
In [ ]:
#Test code to check that the image gets converted to a string
src = Image.open('./src.png')
src
pytesseract.image_to_string(src)
In [ ]:
#Checking the confidence level of OCR. (Assumes `meta` holds word-level results from
#pytesseract.image_to_data with output_type=pytesseract.Output.DATAFRAME.)
conf = [c for c in meta.conf if c >= 0]
sum(conf) / len(conf)
In [ ]:
#The code converts an image to grayscale and then applies a threshold: pixels with values above 220
#become white and pixels at or below 220 become black.
def threshold(im):
  return im.convert('LA').point(lambda p: 255 * (p > 220))
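
A quick sketch of the helper in use: binarize the sample image loaded earlier (`src`) and re-run OCR on it. Whether binarization improves the result depends on the scan, so treat this as an experiment rather than a fixed recipe.

In [ ]:
#Binarize and re-run OCR; compare against the unthresholded output above.
binarized = threshold(src)
pytesseract.image_to_string(binarized, lang='ara')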
In [ ]:
#Language detection with preexisting Python libraries
detected = langdetect.detect_langs('aseklfae fhuasdf')[0]
detected.lang in {'cy': 23, 'ara': 234} #membership test against the dict's keys

III. OCR Demonstration¶

In this section, we delve into the fascinating world of Optical Character Recognition (OCR). We'll demystify the basics of OCR, showing you how this technology works its magic in converting printed or handwritten text into digital, machine-readable form. Whether you're a newcomer to OCR or looking to refresh your knowledge, this segment is designed to equip you with a fundamental understanding of OCR's key principles and how it can be a game-changer in data processing and document management. Let's embark on this journey to unravel the secrets of OCR and discover its wide-ranging applications.

In [ ]:
#This function reads data from a Google Sheet specified by 'title' and 'worksheet,' and then creates a
#DataFrame from the data. If 'has_headers' is True, it assumes the first row contains column names.
def get_df(title, gc, worksheet=0, has_headers=True):
    contents = gc.open(title).get_worksheet(worksheet).get_all_values()
    if has_headers:
        return pd.DataFrame.from_records(
            data=contents[1:],
            columns=contents[0]
        )
    return pd.DataFrame.from_records(contents)
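
A usage sketch, assuming Colab authentication and the gspread client (the sheet title 'ocr_results' is a placeholder):

In [ ]:
#Hypothetical example: open a sheet named 'ocr_results' and load it as a DataFrame.
import gspread
from google.auth import default

creds, _ = default()
gc = gspread.authorize(creds)
df = get_df('ocr_results', gc)
df.head()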
In [ ]:
#This code performs image erosion on 'img' using a 5x5 kernel and displays the result.
#(Assumes `import numpy as np`, `import cv2 as cv`, and `import matplotlib.pyplot as plt`.)
kernel = np.ones((5, 5))
eroded = cv.erode(img, kernel)
plt.imshow(eroded)
In [ ]:
#This code detects and draws lines in 'img' using the Hough Line Transform (algorithm), creating a new
#image 'lines_img' with the detected lines highlighted. (`canny` is assumed to be an edge map of 'img',
#e.g., from cv.Canny; `math` is needed for the angle estimate below.)
import math
lines = cv.HoughLinesP(canny, rho=1, theta=0.01, threshold=0, minLineLength=50)
lines_img = np.copy(img) * 0
for line in lines:
  for x1, y1, x2, y2 in line:
    lines_img = cv.line(lines_img, (x1, y1), (x2, y2), (255, 255, 255), 5)
plt.imshow(lines_img)

#This code calculates the average angle of detected lines in 'lines' and estimates the rotation angle
#of a page based on those lines.
angles = []
for line in lines:
  for x1, y1, x2, y2 in line:
    angles.append(math.atan2(y2-y1, x2-x1))
print('The page is rotated by {} radians.'.format(sum(angles) / len(angles)))

Some additional functions that you can use along with the logger function to make the task more automated.

In [ ]:
#Reads a catalog, tracks processed files, and performs OCR on the remaining files.
#It iterates through the catalog, applies OCR via `save_ocr` (see the Text class in the
#optional module below), and updates the completion status after each file.
def run(catalog_path, out, success):
  catalog = pd.read_csv(catalog_path)
  with open(success, 'r') as f:
    try:
      completed = int(f.read())
    except ValueError:
      completed = 0
  for idx in catalog.index[completed:]:
    row = catalog.iloc[idx]
    id = str(row['ID'])
    path = os.path.join(row['Folder'], row['Name'])
    save_ocr(path, os.path.join(out, id))
    completed += 1
    with open(success, 'w') as f:
      f.write(str(completed))
In [ ]:
#This class is used to estimate the accuracy of text extraction from images. It does so by displaying
#images, prompting the user to verify accuracy, and collecting accuracy data. The 'mean' method calculates
#and returns the mean accuracy from the collected data. (Assumes `matplotlib.pyplot as plt` and
#IPython.display's `display` and `Markdown` are imported.)
class TextAccuracyEstimator:

  def __init__(self, root, text, random):
    self.accuracies = list()
    self._text = text
    self._page_paths = [
        os.path.join(root, 'images', path) for path in self._text.page_paths
        if path.endswith('ppm')
    ]
    self.random = random

  def run(self, n):
    while len(self.accuracies) < n:
      page_idx = self.random.integers(len(self._page_paths))
      metadata = self._text.page_metadata[page_idx]
      page = Image.open(self._page_paths[page_idx])
      word_idx = self.random.choice(metadata.index)
      row = metadata.loc[word_idx]
      if (pd.isna(row.text)):
        continue
      fig, axes = plt.subplots(1, 4)
      fig.set_figwidth(15)
      for i in range(4):
        if row.left < page.size[0] and row.top < page.size[1]:
          word = page.crop(
                (row.left, row.top, row.left+row.width, row.top+row.height))
          try:
            axes[i].imshow(word)
          except Exception:
            warnings.warn('Could not render {}'.format(str(word)))
        page = page.rotate(90, expand=True)
      plt.show(block=False)
      plt.pause(0.1)
      display(Markdown('    {}'.format(row.text)))
      self.accuracies.append('y' in input('Is this correct (y/n)?\n').lower())

  def mean(self):
    return sum(self.accuracies) / len(self.accuracies)
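
A usage sketch, assuming an analysis object `text_analysis` that exposes `page_paths` and `page_metadata`, a root directory containing an 'images' folder, and NumPy for the random sampling:

In [ ]:
#Hypothetical example: hand-check 20 randomly sampled words from an analysis.
import numpy as np
estimator = TextAccuracyEstimator('src_root', text_analysis, np.random.default_rng(0))
estimator.run(20)
estimator.mean() #fraction of sampled words marked correct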
In [ ]:
#This class performs cross-text OCR accuracy analysis, with methods to calculate accuracy,
#find the mean, and visualize the accuracy distribution.
class CrossTextAnalysis:

  def __init__(self, root, random):
    self.root = root
    self.random = random
    self.texts = list()
    self.accuracy_estimators = list()

  def run(self, n):
    for text, dir in get_texts(self.root):
      self.texts.append(text)
      estimator = TextAccuracyEstimator(dir, text, self.random)
      self.accuracy_estimators.append(estimator)
      estimator.run(n)

  def accuracies(self):
    return [estimator.mean() for estimator in self.accuracy_estimators]

  def mean(self):
    return sum(self.accuracies()) / len(self.accuracy_estimators)

  def hist(self):
    plt.hist(self.accuracies())
    plt.xlabel('Accuracy')
    plt.title('Distribution of OCR Accuracy by Text')
    plt.show(block=False)

#Running the OCR pipeline as a whole on large document collections
#(`output_path` is the directory that holds catalog.csv and where results are written)
run(
    os.path.join(output_path, 'catalog.csv'),
    output_path,
    os.path.join(output_path, 'success.txt')
)

IV. OCR Working & Implementation¶

In this section we have created functions designed for Optical Character Recognition (OCR) to efficiently extract text from images or scanned documents. These functions are capable of processing images, determining correct text orientation, and employing OCR technology to extract text. They provide a user-friendly and effective approach for newcomers to implement OCR in their projects.

In [ ]:
#We have two functions, get_model and get_tokenizer, which return a T5 model and a T5 tokenizer, respectively.
from transformers import T5ForConditionalGeneration, T5Tokenizer

def get_model():
    model = T5ForConditionalGeneration.from_pretrained('t5-small')
    return model

def get_tokenizer():
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    return tokenizer


tokenizer = get_tokenizer()
model = get_model()
In [ ]:
#This function generates summarizations for a list of tokenized texts using the T5 model, applying specified
#parameters and constraints, and then concatenates the generated summaries into a single output.
def get_summarization(tokenized_texts, min_length, max_length):
    outputs = ""
    for text in tokenized_texts:
        summary_ids = model.generate(text,
                                    num_beams=4,
                                    no_repeat_ngram_size=2,
                                    min_length=min_length,
                                    max_length=max_length,
                                    early_stopping=True)
        output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        outputs += output
    return outputs
In [ ]:
#This function preprocesses a long text by removing leading/trailing spaces and line breaks, then splits
#it into smaller chunks of 512 characters each. It adds "summarize:" to each chunk, tokenizes them using a
#T5 tokenizer, and returns a list of tokenized chunks.
def preprocess(texts):
    preprocess_texts = texts.strip().replace("\n", "")
    #Note: any trailing chunk shorter than 512 characters is dropped by this split
    texts1 = [preprocess_texts[n*512: (n+1)*512] for n in range(len(preprocess_texts) // 512)]
    tokens=[]
    for text in texts1:
        t5_prepared_Text = "summarize: "+ text
        tokenized_text = tokenizer.encode(t5_prepared_Text, return_tensors="pt")
        tokens.append(tokenized_text)
    return tokens
In [ ]:
#The code processes a batch of text data, generates summaries for each text, and saves the summaries in
#separate text files with incremental filenames. ('su' is the DataFrame of titles and contents built earlier.)
contents = su['Content']
summaries = []
i = 0 #counter for the incremental filenames

for text in contents:
    tokenized_texts1 = preprocess(text)
    outputs = get_summarization(tokenized_texts1, 10, 30)
    name = "src" + str(i) + ".txt"
    with open(name, 'w') as f:
        f.write(outputs)
    i += 1
In [ ]:
#The code analyzes an image to detect a "Left Upper" region by examining columns; if the expected
#pattern of pixels is found, it prints "Left Upper detected."
def checkLeftSuper(img):
  left, bottom, dim = img.shape
  firstBlack = 0
  secondWhite = 0
  for i in range(bottom):
    perpendicularLine = img[:, i, 0]
    if sum(perpendicularLine) != left:
      firstBlack = i
      break
  for i in range(firstBlack, bottom):
    perpendicularLine = img[:, i, 0]
    if sum(perpendicularLine) == left:
      secondWhite = i
      break
  horizontalLine = img[left//2, :secondWhite, 0]
  if sum(horizontalLine) == secondWhite:
    print("Left Upper detected.")
In [ ]:
#This function checks for a "Right Upper" region in an image and prints "Right Upper detected" if found.
def checkRightSuper(img):
  left, bottom, dim = img.shape
  firstBlack = 0
  secondWhite = 0
  for i in reversed(range(bottom)):
    perpendicularLine = img[:, i, 0]
    if sum(perpendicularLine) != left:
      firstBlack = i
      break
  for i in reversed(range(0, firstBlack)):
    perpendicularLine = img[:, i, 0]
    if sum(perpendicularLine) == left:
      secondWhite = i
      break
  horizontalLine = img[left//2, secondWhite:, 0]
  if sum(horizontalLine) == (bottom-secondWhite):
    print("Right Upper detected.")
In [ ]:
#Checks for superscript in an image
def checkIfSuper(img):
  plt.imshow(img)
  checkLeftSuper(img)
  checkRightSuper(img)
In [ ]:
#This code batch-processes PDF files, extracting text while ensuring correct orientation. It iterates
#through the PDFs, converts pages to images, and rotates them as needed based on word frequencies.
#Extracted text is saved in text files, and orientation errors are logged in a separate file.
#(Assumes `from pdf2image import convert_from_path` and `from collections import Counter`.)

data_dir = 'src' #Input dir: a folder containing all your PDFs
output_dir = 'src' #Output dir: any folder you specify

output_error_file = output_dir + 'failed_pages.txt' #Pages that failed the OCR process, identified by PDF name and page number

for idx, pdf in enumerate(os.listdir(data_dir)[1:]): #[1:] skips the first directory entry; drop it to process every file
  without_pdf = pdf.replace('.pdf', '')

  PDF_file = data_dir+'{}'.format(pdf)
  temp_output_name = output_dir + '{}.txt'
  output_file = temp_output_name.format(without_pdf)
  print("The pdf you are working on is "+PDF_file+"\n")
  pages = convert_from_path(PDF_file)

  if not os.path.isdir("src#"+str(idx)):
    print("New folder is created")
    os.mkdir("src#"+str(idx))
  print(len(pages))


  for page_idx, page in enumerate(pages):
    img_count = page_idx + 1
    print("image idx: ", img_count)
    save_name = 'src#'+str(idx)+"/page_"+str(img_count)+".jpg"
    page.save(save_name, 'JPEG') #save the page image so pytesseract can read it below
    flag = False

    for i in range(4):
      text = str(pytesseract.image_to_string(Image.open(save_name)))
      cnt = Counter()
      cnt.update(text.split())
      print(cnt['the'], cnt['de'], cnt['bu'], cnt['im'], cnt['che'], cnt['del'], cnt['e'], cnt['la'])
      #If common function words are scarce, the orientation is probably wrong: rotate and retry
      if cnt['the'] < 3 and cnt['de'] < 3 and cnt['bu'] < 3 and cnt['im'] < 3 and (cnt['e'] + cnt['che'] + cnt['del'] + cnt['la']) < 5:
        page2 = page.rotate(90*(i + 1))
        page2.save(save_name, 'JPEG') #save the rotated page for the next OCR pass
      else:
        flag = True
        break

    if not flag:
      f_error = open(output_error_file, "a")
      message = "Page " + str(img_count) + " in PDF " + pdf + " failed to find the correct orientation.\n"
      f_error.write(message)
      f_error.close()
    else:
      text = str(pytesseract.image_to_string(Image.open(save_name)))

      f = open(output_file, "w+")
      text = text.replace('-\n', '')
      f.write(text)
      f.close()
In [ ]:
#This function scores a Counter of word frequencies: the 2 most common words are weighted 5x and the
#next 3 are weighted 3x (the most_common loops overlap), while words ranked 6-10 are weighted 1x.
def dict_common_sum(cntr):
  sum = 0
  for word, freq in cntr.most_common(2):
    sum += freq * 2
  for word, freq in cntr.most_common(5):
    sum += freq * 2
  for word, freq in cntr.most_common(10):
    sum += freq
  return sum
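
A quick check of the weighting on a toy Counter: 'the' and 'fox' are the two most common words, so their counts are weighted 5x (2 + 2 + 1 across the overlapping loops), while the remaining words are weighted 3x:

In [ ]:
from collections import Counter
cnt = Counter('the the the quick brown fox fox jumps'.split())
dict_common_sum(cnt) #(3+2)*5 + (1+1+1)*3 = 34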
In [ ]:
#1. It processes files and directories in a specified directory, collecting metadata such as ID, size, MIME type, and timestamps for each directory.
#2. For each PDF file in the directory, it extracts pages and determines the correct orientation based on text analysis.
#3. It saves the oriented pages as JPEG images and logs orientation errors in "failed_pages.txt."
#4. The code randomly selects a subset of pages (up to 5 or 10% of total pages) for orientation analysis.
#5. It rotates pages to identify the correct orientation and extracts text from them using OCR.
#6. It collects and saves text data, page IDs, and PDF names for further processing.

def big_loop(top, csvwriter, maxfiles):

  fc = 0
  #Assumes the globals input_dir and output_dir are set, plus `import random`,
  #`from collections import Counter`, and `from pdf2image import convert_from_path`.

  if not os.path.isdir(output_dir):
    os.mkdir(output_dir)


  for path, root, dirs, files in walk(top=top, by_name=False):
      dirpath = '/'.join(path)
      id, _, _, mimeType, createdTime, modifiedTime = extract_metadata(root)
      md5Checksum = len(dirs)
      size = len(files)
      row = [id, md5Checksum, size, mimeType, createdTime, modifiedTime, dirpath]
      csvwriter.writerow(row)
      fc = fc + 1


      for pdf_index in range(len(files)):
        folder_name = str(files[pdf_index]['id'])+'/'
        folder_name_failed = str(id)+'/failed_pages.txt'
        pdf_numbers = len(files)
        file_name = input_dir[:-1]+'/'+files[pdf_index]['name']
        if not (file_name.endswith('.pdf') or file_name.endswith('.PDF')):
          continue
        pages = convert_from_path(file_name)
        if not os.path.isdir(output_dir+folder_name):
          print("New folder is created")
          os.mkdir(output_dir+folder_name)
        csv_id = []
        csv_text = []
        pdf_name = []
        output_error_file = output_dir +folder_name+ 'failed_pages.txt'
        cnt0, cnt1, cnt2, cnt3 = Counter(), Counter(), Counter(), Counter()
        num_to_select = max(min(5, len(pages)), int(0.1 * len(pages)))
        test_pages = random.sample(pages, num_to_select)


        for page_idx, page in enumerate(test_pages):
          img_count = page_idx + 1
          print("image idx: ", img_count)
          save_name = output_dir+folder_name+"page_"+str(img_count)+".jpg"
          page.save(save_name, 'JPEG')


          for i in range(4):
            text = str(((pytesseract.image_to_string(Image.open(save_name)))))
            if (i == 0):
              cnt0.update(text.split())
            if (i == 1):
              cnt1.update(text.split())
            if (i == 2):
              cnt2.update(text.split())
            if (i == 3):
              cnt3.update(text.split())
            page2 = page.rotate(90*(i + 1))
            page2.save(save_name,  'JPEG')
        w_0, w_1, w_2, w_3 = dict_common_sum(cnt0), dict_common_sum(cnt1), dict_common_sum(cnt2), dict_common_sum(cnt3)
        top_orient = max(w_0, w_1, w_2, w_3)

        if w_0 == top_orient:
          k = -1
        elif w_1 == top_orient:
          k = 0
        elif w_2 == top_orient:
          k = 1
        else:
          k = 2

        print("Orientation to be used: " + str(k))
        print(cnt0)
        print(len(cnt0))
        print(w_0)
        print(cnt1)
        print(len(cnt1))
        print(w_1)
        print(cnt2)
        print(len(cnt2))
        print(w_2)
        print(cnt3)
        print(len(cnt3))
        print(w_3)

        for page_idx, page in enumerate(pages):
          img_count = page_idx + 1
          print("image idx: ", img_count)
          save_name = output_dir+folder_name+"page_"+str(img_count)+".jpg"
          page.save(save_name, 'JPEG')
          print("Page")
          page2 = page.rotate(90*(k + 1))
          page2.save(save_name,  'JPEG')
          text = str(((pytesseract.image_to_string(Image.open(save_name)))))
          csv_text.append(text)
          csv_id.append(str(files[pdf_index]['id'])+"_"+str(img_count))
          pdf_name.append(files[pdf_index]['name'])

V. Transliteration¶

The transliteration section implements various functions for converting characters between different languages and scripts in a text. These functions use visual and contextual cues to replace characters with their corresponding transliterations, allowing for accurate text conversion across languages.

In [ ]:
#This line of code extracts text from the image at path 'src' using the 'xyz' language
#(replace 'xyz' with an installed Tesseract language code).
print(pytesseract.image_to_string("src", lang='xyz'))
In [ ]:
#Extract text data (including coordinates and other information) from the image 'src' using the 'xyz' language.
print(pytesseract.image_to_data("src", lang='xyz'))
In [ ]:
#Extracts text boxes from the image 'src' and then splits the result into lines.
pytesseract.image_to_boxes("src").split('\n')
In [ ]:
#Printing single characters. (The image must be a PIL object so we can crop it; 'src.png' is a placeholder path.)
src = Image.open('src.png')
for box in pytesseract.image_to_boxes(src).split('\n'):
  if box:
    left, bottom, right, top = (int(s) for s in box.split()[1:-1])
    print(left, bottom, right, top)
    try:
      plt.imshow(src.crop((left, src.height - top, right, src.height - bottom)))
    except Exception: pass
    plt.show(block=False)
    plt.pause(0.01)
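
The `deaccent` helper used in the next cell is not defined elsewhere in this notebook. A minimal sketch using the standard library's unicodedata: NFD-decompose each character, then drop the combining marks.

In [ ]:
#Minimal `deaccent` sketch: decompose, then strip combining (accent) marks.
import unicodedata

def deaccent(text):
  return ''.join(
      c for c in unicodedata.normalize('NFD', text)
      if not unicodedata.combining(c)
  )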
In [ ]:
#This code processes the input string 'ṬṭÁáÚúĀāŪū' with the `deaccent` helper defined above to remove diacritics or accents, resulting in 'TtAaUuAaUu'.
deaccent('ṬṭÁáÚúĀāŪū')
In [ ]:
#This code helps select characters for multilingual text processing by assigning weights based on character similarity and language context.
#It then chooses the character with the highest weight for accurate text processing.
universal_chars = set('`1234567890-=~!@#$%^&*()_+[]\\;\',./{}|:"<>?')

charsets = {
  'ces': set('AÁBCČDĎEÉĚFGHChIÍJKLMNŇOÓPRŘSŠTŤUÚŮVXYÝZŽaábcčdďeéěfghchiíjklmnňo'
             'óprřsštťuúůvxyýzž0123456789') | universal_chars,
  'eng': set('AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz'
            ) | universal_chars,
}

char_lookalikes = [
  set('8S¥sš'),
  set('I[|{/\\'),
  set('I]|}/\\'),
  set('|l1'),
  set('.,'),
  set('—-'),
]

def char_kin(char0, char1):
  return deaccent(char0) == deaccent(char1) or any(
    (char0 in s and char1 in s) for s in char_lookalikes)

def choose_char(chars):
  weights = {
    c: (len(chars) - 1 - i) / (len(chars) ** 2)
    for i, (_, c) in enumerate(chars)
    if c != ''
  }
  for language, c in chars:
    if language in charsets:
      weights[c] += 1 / len(chars) # This is just a tie-breaker.
      for other in weights:
        if other in charsets[language] and other != c:
          weights[other] -= 1
  ret = max(weights.keys(), key=lambda c: weights[c])
  print('DEBUG: chars={}, choosing {}'.format(str(chars), ret))
  return ret
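
To see the weighting in action, here is a hypothetical match list: the Czech reading 'š' competes with the lookalike '8'. Because '8' also belongs to the Czech charset, it is penalized when the Czech candidate is considered, so 'š' wins:

In [ ]:
choose_char([('ces', 'š'), ('eng', '8')]) #returns 'š'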
In [ ]:
#This code uses PyTesseract to perform character-based transliteration on an image. It replaces characters
#with their corresponding transliterated counterparts, considering similarity and context.
def translit(image, box_sim_thresh=0.65):
  flagship_language = 'ces'

  other_languages = [k for k in charsets.keys() if k != flagship_language]

  data = pytesseract.image_to_data(
    image, lang=flagship_language, output_type=Output.DICT)

  flagship_boxes = pytesseract.image_to_boxes(
    image, lang=flagship_language, output_type=Output.DICT)

  other_boxes = {
        language: pytesseract.image_to_boxes(
          image, lang=language, output_type=Output.DICT)
        for language in other_languages
      }
  chars = list()


  for char0, box0 in zip(flagship_boxes['char'], zip(
      flagship_boxes['left'], flagship_boxes['bottom'],
      flagship_boxes['right'], flagship_boxes['top']
      )):
    matches = [(flagship_language, char0)]

    for other in other_languages:
      for char1, box1 in zip(other_boxes[other]['char'], zip(
          other_boxes[other]['left'], other_boxes[other]['bottom'],
          other_boxes[other]['right'], other_boxes[other]['top']
          )):
        sim = box_sim(*box0, *box1)
        if char1 in charsets[other] and (
            sim >= box_sim_thresh or (sim > 0 and char_kin(char0, char1))):
          matches.append((other, char1))
    chars.append((char0, box0, choose_char(matches)))

  for i, (left, top, width, height, conf, word0) in enumerate(zip(
      data['left'], data['top'], data['width'], data['height'],
      data['conf'], data['text']
      )):
    if int(conf) >= 0:
      right, bottom = left + width, top + height
      top, bottom = image.height - top, image.height - bottom
      word = ''
      for char0, box, char in chars:
        if (char0 in word0) and box_sim(*box, left, bottom, right, top) > 0:
          word += char
      data['text'][i] = word
  return data_to_string(data['text'])
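
The helpers `box_sim` and `data_to_string` used above are not defined in this notebook. A minimal sketch, assuming boxes are (left, bottom, right, top) tuples in Tesseract's bottom-left coordinate system: intersection-over-union for box similarity, and a whitespace join to turn word lists back into text.

In [ ]:
#Sketch of box_sim: intersection-over-union of two (left, bottom, right, top) boxes.
def box_sim(l0, b0, r0, t0, l1, b1, r1, t1):
  inter_w = max(0, min(r0, r1) - max(l0, l1))
  inter_h = max(0, min(t0, t1) - max(b0, b1))
  intersection = inter_w * inter_h
  union = (r0 - l0) * (t0 - b0) + (r1 - l1) * (t1 - b1) - intersection
  return intersection / union if union else 0

#Sketch of data_to_string: join non-empty words with spaces.
def data_to_string(words):
  return ' '.join(str(w) for w in words if str(w).strip())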

VI. Statistics Of Our Performance¶

Description: In this section, we will explore the process of conducting statistical analysis on Optical Character Recognition (OCR) results. By using data analysis tools and techniques, we can gain insights into the quality and reliability of the extracted text. This analysis allows for data-driven decision-making and the potential for improving OCR outcomes.

In [ ]:
import pickle
from scipy import stats
#Load a pickled analysis (e.g., one saved by the Text class in the optional module below)
with open('file.pickle', 'rb') as dbfile:
  src = pickle.load(dbfile)
In [ ]:
#This code computes and visualizes the distribution of 'heights' and 'confidences' from 'src.metadata'.
#It creates a histogram of median word heights and a scatter plot of heights against confidences.
heights = []
for data in src.metadata:
  if data is not None:
    heights.append(data.height.median())
plt.hist(heights, bins=list(range(0, 100))) #one-pixel-wide bins (assumes median heights under ~100px)
plt.show(block=False)

confidences = []
for data in src.metadata:
  if data is not None:
    confidences.append(data.conf.median())
plt.scatter(heights, confidences, s=1)
plt.xlim(1, 30)
In [ ]:
#Calculating Pearson's r, which measures the correlation between heights and confidences
stats.pearsonr(heights, confidences)
In [ ]:
#This code calculates and displays the moving average of confidence ('movmeans') for bins of median word heights.
#It groups confidence values by word-height ranges, computes the mean confidence for each range, and then
#visualizes this data as a scatter plot with median word height on the x-axis and mean confidence on the y-axis.
movmeans = []
bins = []
for lower in list(range(10, 40, 2)):
  bin = [confidences[i] for i, height in enumerate(heights) if lower <= height <= lower+2]
  if bin:
    movmeans.append(sum(bin) / len(bin))
    bins.append(lower+1)
plt.scatter(bins, movmeans)
plt.ylabel('mean confidence')
plt.xlabel('median word height');
In [ ]:
#Computing the median height with the standard library's statistics module
from statistics import median
median(heights)
In [ ]:
# histogram for median height
median_heights = [
           data.height.median()
           for data in [data for data in src.metadata if data is not None]
]
plt.hist(median_heights, bins=list(range(100)))
plt.xlabel('height')
plt.ylabel('frequency')
In [ ]:
#This code calculates and reports the mean wordwise and pagewise confidences from the 'src' data,
#highlighting the number of pages analyzed with a confidence score of less than 75.
import statistics
mean_confidences = list(
    conf for conf in src.mean_confidences if not pd.isna(conf))
print('The mean wordwise confidence was {:.4f},\nthe mean pagewise confidence '
  'was {:.4f},\nand {} pages were analyzed with a confidence of less than '
  '75.'.format(
      statistics.mean(
          confidence
          for data in [data for data in src.metadata if data is not None]
          for confidence in data.conf if confidence > -1
      ),
      statistics.mean(mean_confidences),
      sum(conf < 75 for conf in mean_confidences)
  ))

#Example of Output
#The mean wordwise confidence was 88.3453,
#the mean pagewise confidence was 86.2748,
#and 31 pages were analyzed with a confidence of less than 75.

Functions for working on a large number of files

In [ ]:
#This code creates a bar plot with Seaborn, displaying the categorical distribution of 'true_orientation'
#in degrees clockwise, with the cumulative percentage distribution overlaid, to help analyze page-orientation
#data. (Assumes `import seaborn as sns` and a DataFrame `data` holding the page-level results.)
def bar(series, confidence=0.95):
  categories = list(series.unique())
  x = list()
  y = list()
  for value in series:
    for category in categories:
      x.append(category)
      y.append(1 if category == value else 0)
  frequencies = {
      c: sum(y[i] for i, a in enumerate(x) if a == c) / len(series)
      for c in categories
  }
  categories.sort(key=(lambda c: frequencies[c]), reverse=True)
  ax = sns.barplot(
      x=x,
      y=y,
      ci=confidence*100,
      order=categories,
  )
  ax.set_ylabel('Frequency')
  ax.plot(range(len(categories)), [
      sum(frequencies[c] for c in categories[:i+1])
      for i in range(len(categories))
      ], 'm', marker='*', linestyle='solid')
  return ax
ax = bar(data.true_orientation, confidence=0.99)
ax.set_xlabel('Orientation (Degrees Clockwise)')
ax.set_title('Categorical Distribution of Page Orientation');
In [ ]:
#This code uses bootstrapping to analyze the 'true_orientation' data by comparing the proportions of '270' and '90' values and stores the results in 'ab_test'.
def boot(series, statistic, n=100000):
  values = list()
  for _ in range(n):
    sample = series.sample(frac=1, replace=True)
    values.append(statistic(sample))
  return values
ab_test = boot(
    data.true_orientation,
    statistic=(
        lambda s: (sum(s == '270') - sum(s == '90')) / len(s.index)
    )
)
In [ ]:
#This code generates a summary table of frequencies for the 'true_orientation' data, including confidence intervals.
#It calculates the lower and upper bounds for the specified confidence level and then returns info in a DataFrame format.
def frequencies(series, confidence=0.95, n_boot=10000):
  alpha = 1 - confidence
  counts = series.value_counts()
  frequencies = counts / len(series.index)
  boot_results = [
      boot(series,
           lambda seq: (sum(1 if s == a else 0 for s in seq) / len(seq)),
           n_boot)
      for a in counts.index
  ]
  lower = [
           np.percentile(boot_results[i], alpha / 2 * 100)
           for i in range(len(boot_results))
  ]
  upper = [
           np.percentile(boot_results[i], 100 - alpha / 2 * 100)
           for i in range(len(boot_results))
  ]
  return pd.DataFrame(
      data={
          'count': counts,
          'frequency': frequencies,
          '{:.0f}% CI Lower'.format(confidence * 1e2): lower,
          '{:.0f}% CI Upper'.format(confidence * 1e2): upper,
          },
      index=frequencies.index
  )
frequencies(data.true_orientation)
In [ ]:
#This is an example of a two-tailed test: the proportion of bootstrap samples below zero, doubled,
#approximates the p-value. For more info, please refer to online sources.
2 * sum(x < 0 for x in ab_test) / len(ab_test)
In [ ]:
#This code defines a function, 'mean_confidence_bar,' to create a bar chart showing the mean confidence distribution
#for a categorical variable ('x') in the 'data' and sorts categories by mean confidence.
def mean_confidence_bar(data, x):
  counts = data[x].value_counts()
  categories = [c for c in data[x].unique() if counts[c] > 1]
  categories.sort(
      key=lambda c: data[(data[x] == c) & (data.mean_confidence != '')
         ].mean_confidence.mean(),
      reverse=True
  )
  ax = sns.barplot(
      x=x,
      y='mean_confidence',
      data=data[data.mean_confidence != ''],
      order=categories
      )
  ax.set_title('Mean Confidence by {}'.format(x))
  return ax

mean_confidence_bar(data, 'true_orientation')
In [ ]:
#Conducts bootstrapping to estimate a global accuracy value based on projected accuracies and the
#number of words in each text. (`projected_accuracies` is assumed to have been computed earlier.)
global_accuracies = boot(
    pd.DataFrame(data={
        'projected_accuracy': projected_accuracies,
        'n_words': [
           len(text.split()) for text in data[data.mean_confidence != ''].text
        ]
    }),
    lambda df: df.apply(
        (lambda row: row.n_words * row.projected_accuracy),
        axis=1
        ).sum() / df.n_words.sum(),
    n=1000
)

Optional Module¶

Here we've got an exciting addition for you – an optional module designed to enhance your OCR (Optical Character Recognition) experience, especially when dealing with large files. We've developed a user-friendly interface that simplifies working with Tesseract, making it more efficient and accessible.

With this optional module, you'll be able to seamlessly process extensive documents and images, saving you time and effort. Whether you're a newbie or an experienced user, this interface provides a streamlined workflow, ensuring that you can harness the power of Tesseract OCR with ease.

So, whether you're working on lengthy scanned documents, image collections, or any other large-scale OCR project, our interface will be your trusty companion, making the process smoother and more efficient. Dive in, explore its features, and enjoy a more productive OCR experience!

In [ ]:
#This Python code defines a generic `WeightTracker` class that tracks the weights of items in a list,
#allowing for weighted item addition and reordering. The next cell provides a list of common languages
#for OCR (Optical Character Recognition) purposes.
from typing import (Any, Generic, Hashable, Iterable, List, Optional,
                    Sequence, Tuple, TypeVar)

Item = TypeVar('Item', bound=Hashable)

class WeightTracker(Generic[Item]):
    def __init__(
        self,
        items: Sequence[Item],
        presorted: bool = True,
        r: float = 0.5
    ):

        self.items = list(items)
        self.r = r
        self.weights = {
            item: (1 / (i + 1) if presorted else 0)
            for i, item in enumerate(items)
        }
    #Increases the weight given to `item` and re-orders the items by weight.
    def add_weight(self, item: Item):
        self.weights = {item: self.weights[item]
                        * self.r for item in self.items}
        self.weights[item] = self.weights.get(item, 0) + 1
        self.items.sort(key=lambda item: self.weights[item], reverse=True)
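
A small usage sketch: track which OCR language keeps succeeding so it is tried first on later pages.

In [ ]:
langs = WeightTracker(['eng', 'deu', 'fra'])
langs.add_weight('deu')
langs.add_weight('deu')
langs.items[0] #'deu' now has the highest weight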
In [ ]:
COMMON_LANGUAGES = [
    'eng', 'tur', 'ara', 'deu', 'fra', 'rus', 'spa', 'nld',
    'jpn', 'chi_sim', 'chi_tra', 'heb', 'ita', 'dan', 'swe',
    'ell', 'lat', 'fin'
]
In [ ]:
#This class, named `Text`, is designed for performing Optical Character Recognition (OCR) on documents.
#It includes various functionalities such as:
#1. Initializing OCR parameters: It allows you to set parameters like image scaling, confidence thresholds, and language tracking.
#2. Analyzing and processing pages: It reads text data from document pages, applies OCR, and records metadata, language, and confidence scores.
#3. Handling language and scale optimization: It dynamically adjusts the language and image scale for OCR to improve accuracy.
#4. Correcting OCR results: It has a correction mechanism that can replace low-confidence OCR results with higher-confidence alternatives.
#5. Saving OCR output: It can save the OCR results, including text, language, and metadata, to CSV files and other formats.

class Text:

    global_possible_languages = COMMON_LANGUAGES + list(filter(
        lambda key: key not in COMMON_LANGUAGES,
        lang.langcodes.TESSERACT.keys()
    ))
    languages_by_script = {
        'Latin': {
            'eng', 'tur', 'deu', 'fra', 'spa', 'nld', 'ita', 'dan', 'swe',
            'fin'
        },
        'Arabic': {'ara'},
        'Cyrillic': {'rus'},
        'Greek': {'ell'},
        'Japanese': {'jpn'},
        'Japanese_vert': {'jpn'},
        'Han': {'chi_sim', 'chi_tra'},
        'Hebrew': {'heb'},
    }
    iso2tess = {
        'en': 'eng',
        'tr': 'tur',
        'ar': 'ara',
        'de': 'deu',
        'fr': 'fra',
        'ru': 'rus',
        'es': 'spa',
        'nl': 'nld',
        'ja': 'jpn',
        'zh': 'chi_sim', #note: a dict holds one value per key, so 'zh' cannot map to both chi_sim and chi_tra
        'he': 'heb',
        'it': 'ita',
        'da': 'dan',
        'sv': 'swe',
        'el': 'ell',
        'la': 'lat',
        'fi': 'fin'
    }

    default_image_scale = 1.75
    alternate_image_scales = (2, 4)
    word_height_range = (14, 17)
    target_word_height = 15.5
    target_mean_conf = 90
    max_unreadable = 5
    max_n_foreign_words = 30

    def __init__(
        self,
        src: os.PathLike,
        out: os.PathLike,
        coarse_thresh=75,
        min_relative_conf=0,
        image_area_thresh=0.5,
        text_len_thresh=100,
        languages: Optional[WeightTracker] = None,
        second_languages=None,
        verbose=False
    ):

        self.src = src
        self.out = out
        self.coarse_thresh = coarse_thresh
        self.min_relative_conf = min_relative_conf
        self.image_area_thresh = image_area_thresh
        self.text_len_thresh = text_len_thresh
        self.languages = (
            WeightTracker(Text.global_possible_languages, presorted=True)
            if languages is None else languages
        )
        self.second_languages = (
            WeightTracker(Text.global_possible_languages, presorted=True)
            if second_languages is None else second_languages
        )
        self.verbose = verbose
        self.annotator = lang.detect.get_language_annotator()
        self.texts = list()
        self.metadata = list()
        self.orientations = list()
        self.page_languages = list()
        self.mean_confidences = list()
        self.used_original_texts = list()
        self.times = list()
        self.scales = list()


    def save_ocr(self):
        t0 = time.time()
        document = fitz.open(self.src)  # type: ignore
        for i, page in enumerate(document):
            if self.verbose:
                print('{} out of {} pages analyzed in {:.2f} seconds...'
                      ''.format(i, len(document), time.time() - t0))
            self._analyze_page(page)
        os.makedirs(self.out, exist_ok=True)
        pd.DataFrame(data={
            'text': self.texts,
            'orientation': self.orientations,
            'language': self.page_languages,
            'mean_confidence': self.mean_confidences,
            'used_original_text': self.used_original_texts,
            'time': self.times,
            'scale': self.scales,
        }).to_csv(os.path.join(self.out, 'page.csv'))
        self.save()


    def _analyze_page(self, page: fitz.Page):

        original_text = page.get_text()  # type: ignore
        if (
            total_image_area(page) / page.bound().getArea()
            < self.image_area_thresh
            and not len([a for a in original_text if a == '�'])
            > self.max_unreadable
        ):
            metadata, orientation_used, scale = None, None, None
            language = detected_language(original_text)
            self.texts.append(original_text)
            self.mean_confidences.append(None)
            used_original_text = True
        else:
            metadata, orientation_used, language, scale = self._run_ocr(
                page,
                (detected_language(original_text)
                 if len(original_text) >= self.text_len_thresh
                 else self.languages.items[0])
            )
            if mean_conf(metadata) < self.coarse_thresh:
                warnings.warn('Failed to analyze image.')
            self.texts.append(data_to_string(
                metadata.corrected if 'corrected' in metadata.columns
                else metadata.text
            ))
            self.mean_confidences.append(mean_conf(metadata))
            used_original_text = False
        self.languages.add_weight(language)
        self.metadata.append(metadata)
        self.orientations.append(orientation_used)
        self.page_languages.append(language)
        self.used_original_texts.append(used_original_text)
        self.times.append(time.time())
        self.scales.append(scale)


    def _run_ocr(
        self, page: fitz.Page, language_guess: str
    ) -> Tuple[
        Optional[pd.DataFrame],
        Optional[float],
        Optional[str],
        Optional[float]
    ]:
        orientation_used = 0
        scale_used = self.default_image_scale
        image = image_from_page(page, scale=scale_used).rotate(  # type: ignore
            orientation_used, expand=True)
        try:
            metadata = data(image, language_guess)
        except TesseractError as e:
            warnings.warn('Tesseract failed: ' + str(e))
            return (None, None, None, None)
        if mean_conf(metadata) < self.coarse_thresh:
            if self.verbose:
                print('First guess at orientation + language failed.')
            for scale in self.alternate_image_scales:
                image = image_from_page(page, scale=scale)
                try:
                    result = self._osd_assisted_analysis(image)
                    if mean_conf(result[-1]) > mean_conf(metadata):
                        orientation_used, language_guess, metadata = result
                        scale_used = scale
                    if mean_conf(metadata) >= self.coarse_thresh:
                        break
                except (TesseractError, ManagerError) as e:
                    warnings.warn('OCR failed: ' + str(e))
        metadata, language, scale_used = \
            self._final_pass_analysis(
                metadata, page, language_guess, scale_used, orientation_used
            )
        return (metadata, orientation_used, language, scale_used)


    def _osd_assisted_analysis(
        self,
        image: Image
    ) -> Tuple[float, str, pd.DataFrame]:
        osd_result = osd(image)
        image = image.rotate(osd_result['Orientation in degrees'], expand=True)
        if osd_result['Script'] not in lang.langcodes.SCRIPTS:
            raise ManagerError('The script detected by OSD, "{}", is not '
                               'supported.'.format(osd_result['Script']))
        poss_languages = lang.langcodes.SCRIPTS[osd_result['Script']]
        for language in self.languages.items:
            if language in poss_languages:
                return (osd_result['Orientation in degrees'], language,
                        data(image, language))
        raise ManagerError(
            'There exists no language known to this Text instance that '
            'corresponds to the script "{}".'.format(osd_result['Script'])
        )


    def _final_pass_analysis(
        self,
        metadata: pd.DataFrame,
        page: fitz.Page,
        language_used: str,
        scale_used: float,
        orientation_used: float,
        words_to_erase: Optional[pd.DataFrame] = None,
        max_depth: int = 5
    ) -> Tuple[pd.DataFrame, str, float]:
        median_height = metadata[is_text(metadata.text)].height.median()
        language = detected_language(
            data_to_string(metadata.text),
            default=language_used
        )
        if not max_depth:
            print(
                'WARNING: Failed to complete final pass of analysis on the '
                'text:\n{}.'.format(data_to_string(metadata.text))
            )
        elif (
            language != language_used
            or (
                mean_conf(metadata) < self.target_mean_conf
                and median_height
                and not pd.isna(median_height)
                and not (
                    self.word_height_range[0] <= median_height <=
                    self.word_height_range[1]
                )
            )
        ):
            optimal_scale = (
                scale_used * self.target_word_height / median_height
            ) if median_height else scale_used
            if self.verbose:
                print('Retrying. Language={}, scale={:.4f}'.format(
                    language, optimal_scale
                ))
            image = image_from_page(
                page, scale=optimal_scale
            ).rotate(  # type: ignore
                orientation_used, expand=True
            )
            if words_to_erase is not None:
                erase_words(image, words_to_erase, optimal_scale)
            result = data(image, language)
            if mean_conf(result) > mean_conf(metadata):
                metadata = result
                scale_used = optimal_scale
            # Filter out "words" that are likely non-textual
            metadata = metadata[  # FIXME: 2 is a magic number
                (metadata.height < 2 * median_height)
                & [isinstance(word, str) for word in metadata.text]
            ]
            annotations = [
                lang.langcodes.bcp47_to_tess(annotation, language)
                for annotation in self.annotator(metadata.text)
            ]
            was_wrong_lang = [
                annotation != language for annotation in annotations
            ]
            if sum(was_wrong_lang) > self.max_n_foreign_words:
                if self.verbose:
                    print('The following text has mixed languages:\n{}'.format(
                        inline_annotations(metadata.text, annotations)
                    ))
                wrong_lang_metadata = metadata[was_wrong_lang]
                if detected_language(
                    data_to_string(wrong_lang_metadata.text),
                    default=language
                ) != language:
                    metadata = metadata[[not x for x in was_wrong_lang]]
                    other_languages_metadata, _, _ = self._final_pass_analysis(
                        wrong_lang_metadata,
                        page,
                        language,
                        optimal_scale,
                        orientation_used,
                        (
                            metadata if words_to_erase is None
                            else pd.concat([metadata, words_to_erase])
                        ),
                        # Hasten the approach to the end of the recursion if
                        # few words remain in the current language this pass.
                        max_depth - 1 - (
                            len(metadata.index) < self.max_n_foreign_words
                        )
                    )
                    # FIXME: Place the results in context according to location
                    # instead of simply appending them to the end
                    metadata = pd.concat([metadata, other_languages_metadata])
        return metadata, language, scale_used


    def _correct(self, image: Image, metadata: pd.DataFrame, min_conf: float):
        def corrector(row):
            if 0 <= row.conf < min_conf:
                word_image = image.crop(  # type: ignore
                    (row.left, row.top, row.left+row.width, row.top+row.height)
                )
                for language in self.second_languages.items:
                    word_metadata = data(
                        word_image, language,
                        config='--psm 8'  # Expect a single word.
                    )
                    if mean_conf(word_metadata) >= min_conf:
                        self.second_languages.add_weight(language)
                        correct_word = data_to_string(word_metadata.text).strip()
                        if self.verbose:
                            print('Correcting "{}" to "{}" (lang={})'.format(
                                row.text, correct_word, language))
                        return correct_word
            # Fall back to the original text if no second language produced a
            # sufficiently confident reading.
            return row.text
        metadata['corrected'] = metadata.apply(corrector, axis=1)


    def save(self):
        self.annotator = None  # Null this out -- it need not be saved
        with open(os.path.join(self.out, 'analysis.pickle'), 'wb') as dbfile:
            pickle.dump(self, dbfile)
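Once `save` has run, the pickled analysis can be restored in a later session. Below is a minimal sketch, assuming the same output directory the class wrote to (`out_dir` here is a hypothetical path) and that the helper functions defined in the following cells are available when unpickling:
In [ ]:
# A minimal sketch of restoring a saved analysis; `out_dir` is hypothetical.
import os
import pickle

def load_analysis(out_dir: str):
    # 'analysis.pickle' is the file name written by `save` above.
    with open(os.path.join(out_dir, 'analysis.pickle'), 'rb') as dbfile:
        analysis = pickle.load(dbfile)
    # `annotator` was nulled out before saving, so reattach one if needed.
    return analysis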
In [ ]:
# Returns items in decreasing order by frequency, without any repetitions.
def by_frequency(items: Iterable[Any]) -> List[Any]:
    frequencies = dict()
    ret = list()
    for item in items:
        frequencies[item] = frequencies.get(item, 0) + 1
        if item not in ret:
            ret.append(item)
    return sorted(ret, key=lambda item: -frequencies[item])
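As a quick sanity check, `by_frequency` keeps the first occurrence of each item and orders the results by how often each one appeared:
In [ ]:
# 'a' appears three times, 'b' twice, and 'c' once.
print(by_frequency(['a', 'b', 'a', 'c', 'a', 'b']))  # ['a', 'b', 'c']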
In [ ]:
# Erases the words that appear in `data` from `img`.
def erase_words(img: Image, data: pd.DataFrame, scale: float = 1):
    d = ImageDraw.Draw(img)
    def erase_word(word):
        d.rectangle(
            (
                word.left * scale,
                word.top * scale,
                (word.left + word.width) * scale,
                (word.top + word.height) * scale
            ),
            fill='#fff'
        )
    data.apply(erase_word, axis=1)
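To try `erase_words` without a real scan, we can build a small test image and a one-row metadata table with the `left`, `top`, `width`, and `height` columns the function expects; the box coordinates below are arbitrary:
In [ ]:
# Paint a hypothetical word box white on a gray test image.
from PIL import Image as PILImage
import pandas as pd

test_img = PILImage.new('RGB', (200, 100), color='#888')
word_boxes = pd.DataFrame(
    [{'left': 10, 'top': 20, 'width': 80, 'height': 15}]
)
erase_words(test_img, word_boxes)  # The box is filled with white at scale 1.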
In [ ]:
# Pairs strings with annotations and presents them in a human-readable format.
def inline_annotations(words: Sequence[str], annotations: Sequence[str]) -> str:
    return ' '.join(
        f'{word}->{annotation}'
        for word, annotation in zip(words, annotations)
    )
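For example, pairing two words with the language labels an annotator assigned to them:
In [ ]:
print(inline_annotations(['hello', 'monde'], ['eng', 'fra']))
# hello->eng monde->fra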
In [ ]:
# Returns the detected language of `text`, as the LangCode recognized by Tesseract.
def detected_language(
    text: str,
    default: str = 'eng',
    # The identifier is created once, at definition time, and reused across calls.
    nnli: NNetLanguageIdentifier = NNetLanguageIdentifier(1, 700)
) -> str:
    if not text.strip():
        return default
    result = nnli.FindLanguage(text)
    if not result.probability:
        return default
    return lang.langcodes.bcp47_to_tess(result.language, default)
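A short demonstration: a long, unambiguous sentence should come back as a Tesseract language code (most likely 'eng' here), while an empty string falls back to `default`:
In [ ]:
# The expected outputs are hedged guesses; CLD3's prediction may vary.
print(detected_language('The quick brown fox jumps over the lazy dog.'))
print(detected_language('', default='deu'))  # 'deu'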
In [ ]:
# Converts a page from a PDF document (represented as a `fitz.Page`) into an
# image, allowing for optional scaling, and returns it as a Pillow (PIL)
# Image object.
def image_from_page(page: fitz.Page, scale: float = 1) -> Image:
    pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))  # type: ignore
    return Image.frombytes(  # type: ignore
        ("RGBA" if pix.alpha else "RGB"),
        (pix.width, pix.height), pix.samples
    )
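For instance, to render the first page of a document at double resolution (`sample.pdf` is a hypothetical file name):
In [ ]:
# Render page 0 of a hypothetical PDF at 2x scale.
doc = fitz.open('sample.pdf')  # hypothetical input file
first_page_image = image_from_page(doc[0], scale=2)
print(first_page_image.size)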
In [ ]:
# Returns the total area (in pixels) consumed by images that appear in `page`.
# Does not account for overlap between images, so the total computed area can
# exceed the actual area of the page.
def total_image_area(page: fitz.Page) -> int:
    return sum(
        rect.getArea()
        for image in page.get_images()
        for rect in page.get_image_rects(image)  # type: ignore
    )
In [ ]:
# Returns the mean confidence by word of the OCR output given by `metadata`.
def mean_conf(metadata: pd.DataFrame) -> float:
    if metadata is None:
        return 0
    valid_confs = metadata.conf[(metadata.conf >= 0) & is_text(metadata.text)]
    return valid_confs.mean() if len(valid_confs.index) > 0 else 0
In [ ]:
# Returns a boolean array indicating which elements of `s` are text.
def is_text(s: Iterable[str]) -> pd.array:
    return pd.array([
        (isinstance(text, str) and (text.strip() != '')) for text in s
    ])
In [ ]:
# Returns orientation and script data for `image`.
def osd(image: Image) -> dict:
    s = pytesseract.image_to_osd(image)
    ret = dict()
    for line in s.split('\n'):
        if line:
            # Split only on the first colon, in case a value contains one.
            key, value = line.split(':', 1)
            key, value = key.strip(), value.strip()
            ret[key] = appropriate_type(value)
    return ret
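Applied to a page image, `osd` yields a dictionary keyed by Tesseract's OSD field names, such as 'Orientation in degrees', 'Rotate', and 'Script'. Note that `osd` relies on `appropriate_type`, defined in the next cell, so run that cell first. Reusing the hypothetical `first_page_image` from the earlier sketch:
In [ ]:
# Values are already cast to int/float/str by `appropriate_type`.
page_osd = osd(first_page_image)
print(page_osd.get('Orientation in degrees'), page_osd.get('Script'))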
In [ ]:
# Returns a representation of `value` cast to the simplest possible type given its content.
def appropriate_type(value: Any) -> Any:
    try:
        return int(value)
    except ValueError:
        try:
            return float(value)
        except ValueError:
            return value
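A few examples of the casting behavior:
In [ ]:
print(appropriate_type('3'))      # 3 (int)
print(appropriate_type('3.5'))    # 3.5 (float)
print(appropriate_type('Latin'))  # 'Latin' (left as a string)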
In [ ]:
# Returns a `DataFrame` with the OCR output corresponding to `image`.
def data(image: Image, language: str, config: str = '') -> pd.DataFrame:
    s = str(pytesseract.image_to_data(image, lang=language, config=config))
    df = pd.read_csv(  # type: ignore
        StringIO(s), sep='\t', quoting=csv.QUOTE_NONE
    )
    df['language'] = [language] * len(df.index)
    return df
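Running `data` on a page image returns Tesseract's tab-separated output as a `DataFrame`. The columns used throughout this book are `left`, `top`, `width`, `height`, `conf`, and `text`, plus the `language` column appended above. Again reusing the hypothetical `first_page_image`:
In [ ]:
# OCR the rendered page and peek at the word-level rows.
page_data = data(first_page_image, 'eng')
print(page_data[['left', 'top', 'width', 'height', 'conf', 'text']].head())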
In [ ]:
# Extracts a string from the metadata table column `words` that is identical to
# the one generated by `pytesseract.image_to_string`. Used to avoid redundant
# computations.
def data_to_string(words: Iterable[str]) -> str:
    text = ' '.join('\n' if pd.isna(word) else str(word) for word in words)
    single_newline = re.compile(r' \n ')
    multiple_newline = re.compile(r'( \n){2,} ')
    text = multiple_newline.sub('\n\n', text)
    text = single_newline.sub('\n', text)
    return text
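Because Tesseract's data table marks line breaks with empty (`NaN`) text cells, `data_to_string` can rebuild the plain-text output from the table alone; a single `NaN` becomes a newline and a run of them becomes a blank line:
In [ ]:
# The NaN cell acts as a line break between the two phrases.
print(data_to_string(['Hello', 'world', float('nan'), 'second', 'line']))
# Hello world
# second line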

VIII. Connect With Us¶

We are excited to invite students and professionals with an interest in cuneiform translation to join our project and become part of our growing community. If you are passionate about deciphering ancient scripts and uncovering the mysteries of the civilizations that produced them, we welcome your expertise and enthusiasm.

By connecting with us, you can:

  1. Contribute to Research: Participate in our ongoing cuneiform translation project and help expand our understanding of these fascinating texts. Your work will directly advance this field of study.

  2. Collaborate with Experts: Engage with a network of experts and like-minded individuals who share your passion for cuneiform translation. Collaborate on projects, share insights, and learn from one another.

  3. Access Resources: Gain access to valuable resources, reference materials, and tools that can aid in your translation endeavors. We are committed to providing a supportive environment for your research.

  4. Stay Informed: Receive updates on the latest developments in cuneiform studies, including new discoveries, research opportunities, and events within the field.

To get involved or express your interest, please contact us at cuneiform@factgrid.eu to learn how you can become part of our cuneiform translation community. Together, we can unlock the secrets of the past and contribute to the preservation of ancient knowledge. Join us on this exciting journey!