from bs4 import BeautifulSoup
from collections import OrderedDict

import re, html, math, json, lxml

class html_merge:
    """Merge per-character ``<span>`` elements into whole lines of text.

    The input HTML is expected to have, as direct children of ``<body>``:

    * ``<div>`` elements, each of which starts a new output container, and
    * ``<span>`` elements whose inline ``style`` carries ``top`` and ``left``
      pixel offsets (text spans), optionally with ``width``/``height``
      (border spans; a height of 0 is treated as an underline).

    Spans sharing the same ``top`` offset are merged into one span, with the
    letters ordered by their ``left`` offset, because the source markup is
    not guaranteed to list the letters of a line in reading order.
    """

    # Spans whose ``height`` is at least this many pixels are borders that are
    # not underlines; they are discarded.  Text spans have no ``height``.
    _BORDER_MIN_HEIGHT_PX = 5
    # Horizontal distance (px) between neighbouring letters above which a
    # blank spacer span is emitted to preserve the visual gap.
    _GAP_THRESHOLD_PX = 20

    def __init__(self, str_html):
        """Parse *str_html* and set up the layout constants.

        Args:
            str_html: HTML markup to process.
        """
        self.soup = BeautifulSoup(str_html, 'lxml')
        # BeautifulSoup already decodes character references while parsing,
        # so the parsed tree is used as-is.  (``html.unescape`` only accepts
        # strings and must not be applied to the soup object.)
        self.good_soup = self.soup
        # Scale factor applied to underline widths and to gap spacers.
        self.multiplier_constant = 1.2364
        # Two-letter ordinal suffixes (1st, 2nd, 3rd, 4th) that are dropped
        # from the output when they form a line on their own.
        self.banished_words = ('st', 'rd', 'th', 'nd')

    @staticmethod
    def _px_value(style, prop):
        """Return the numeric value of CSS property *prop* in *style*.

        Only the exact property is matched (``left`` does not match
        ``margin-left``).  Decimal values are truncated towards zero.

        Returns:
            The value as ``int``, or ``None`` when the property is absent or
            not numeric.
        """
        pattern = r'(?<![\w-])' + re.escape(prop) + r'\s*:\s*(-?\d+(?:\.\d+)?)'
        match = re.search(pattern, style or '')
        if match is None:
            return None
        return int(float(match.group(1)))

    @staticmethod
    def _with_px_value(style, prop, value):
        """Return *style* with the number of property *prop* set to *value*.

        Only the first occurrence of that exact property is rewritten; every
        other declaration is left untouched.  *style* is returned unchanged
        when the property is absent.
        """
        pattern = r'((?<![\w-])' + re.escape(prop) + r'\s*:\s*)-?\d+(?:\.\d+)?'
        return re.sub(pattern, lambda m: m.group(1) + str(value), style, count=1)

    @staticmethod
    def _split_line(letters_by_left, gap_threshold=20):
        """Join the letters of one line, splitting at wide horizontal gaps.

        Args:
            letters_by_left: Mapping of left offset (int-like key) to the
                letter at that offset; ``None`` letters are ignored.
            gap_threshold: Minimum distance in px between two neighbouring
                offsets that counts as a gap.

        Returns:
            ``(segments, tail)`` where *segments* is a list of
            ``('text', word)`` and ``('gap', px)`` tuples in left-to-right
            order and *tail* is the text after the last gap (possibly ``''``).
        """
        offsets = sorted(letters_by_left, key=int)
        segments = []
        word = ''
        last_index = len(offsets) - 1
        for index, offset in enumerate(offsets):
            letter = letters_by_left[offset]
            # The current letter always belongs to the word being built,
            # including the letter right before a gap.
            if letter is not None:
                word += letter
            if index == last_index:
                break
            gap = int(offsets[index + 1]) - int(offset)
            if gap > gap_threshold:
                if word:
                    segments.append(('text', word))
                    word = ''
                segments.append(('gap', gap))
        return segments, word

    def _new_page_div(self):
        """Create the container ``<div>`` that collects the spans of a page."""
        div = self.good_soup.new_tag('div')
        div.attrs['class'] = 'paper'
        div.attrs['style'] = 'position: relative; width: 100%; height: 100%;'
        return div

    def merge_elements(self):
        """Group the body's spans by their ``top`` offset.

        Returns:
            An ``OrderedDict`` in document order.  ``<div>`` elements are
            stored under their ``.string`` value; merged spans are stored
            under their integer ``top`` offset.  The ``.string`` of every
            stored span is a JSON object mapping left offset to letter.

        Text nodes between elements are ignored.  Spans without a ``style``
        carrying both ``top`` and ``left`` cannot be placed and are skipped.
        """
        span_dict = OrderedDict()
        body = self.good_soup.find('body')
        if body is None:
            # Nothing was parsed (e.g. empty input).
            return span_dict

        for element in body.children:
            name = getattr(element, 'name', None)
            if name is None:
                # Whitespace/text node between elements.
                continue
            if name == 'div':
                # NOTE: divs with an identical ``.string`` (including None)
                # share one entry; the key is kept for compatibility.
                span_dict[element.string] = element
                continue
            if name != 'span':
                print('not div or span')
                continue

            style = element.get('style') or ''
            top_px = self._px_value(style, 'top')
            left_px = self._px_value(style, 'left')
            if top_px is None or left_px is None:
                continue

            height = self._px_value(style, 'height')
            if height is not None and height >= self._BORDER_MIN_HEIGHT_PX:
                # A border that is not an underline: not part of the text.
                continue

            if height == 0:
                # Underline border: widen it so that it lines up with the
                # re-flowed text.
                width = self._px_value(style, 'width')
                if width is not None:
                    scaled = math.floor(width * self.multiplier_constant)
                    element['style'] = self._with_px_value(style, 'width', scaled)

            anchor = span_dict.get(top_px)
            if anchor is None:
                # First span on this line: it becomes the container and its
                # text is replaced by the JSON offset -> letter mapping.
                element.string = json.dumps({str(left_px): element.string})
                span_dict[top_px] = element
            else:
                letters = json.loads(anchor.string)
                letters[str(left_px)] = element.string
                anchor.string = json.dumps(letters)
        return span_dict

    def combine_elements_to_html(self, span_dict):
        """Render the mapping produced by :meth:`merge_elements` as HTML.

        Every ``<div>`` in *span_dict* starts a new ``<div class="paper">``
        container; every span is rebuilt as a positioned span holding nested
        spans for its words and for the blank gaps between them, and is
        appended to the current container.

        Args:
            span_dict: Mapping whose values are ``<div>`` tags or spans whose
                ``.string`` is a JSON object of left offset -> letter.

        Returns:
            The concatenated markup of all containers as a ``str``.
        """
        pages = []
        current_div = None

        for element in span_dict.values():
            if element.name == 'div':
                # A new div closes the container collected so far.
                if current_div is not None:
                    pages.append(str(current_div))
                current_div = self._new_page_div()
                continue

            letters_by_left = json.loads(element.string)
            segments, tail = self._split_line(letters_by_left, self._GAP_THRESHOLD_PX)

            style = element.get('style') or ''
            is_text_line = self._px_value(style, 'height') is None

            # The container span is the first one seen in the document, which
            # is not necessarily the left-most letter of the line.
            first_left = min(letters_by_left, key=int)
            current_left = self._px_value(style, 'left')
            if current_left is not None and current_left > int(first_left):
                style = self._with_px_value(style, 'left', first_left)

            children = []
            for kind, value in segments:
                child = self.good_soup.new_tag('span')
                if kind == 'text':
                    child.string = value
                else:
                    # Empty inline-block span that mimics the white space.
                    child.attrs['style'] = 'width:{}px;display:inline-block;'.format(
                        value * self.multiplier_constant)
                children.append(child)
            if tail:
                tail_span = self.good_soup.new_tag('span')
                tail_span.string = tail
                children.append(tail_span)
            if not tail and not any(kind == 'text' for kind, _ in segments):
                # No text at all: do not emit blank nested spans.
                children = []

            # Use the tree-editing API rather than assigning to ``.contents``
            # so that parent/sibling links stay consistent for serialisation.
            element.clear()
            for child in children:
                element.append(child)

            if is_text_line:
                if style and not style.rstrip().endswith(';'):
                    style += ';'
                style += 'width: inherit;'
                element.attrs['class'] = 'line'
            element['style'] = style

            if len(tail) == 2 and tail in self.banished_words:
                # HACK: superscript ordinal suffixes are not positioned
                # correctly, so they are left out of the output.
                continue

            if current_div is None:
                # Spans that appear before any div still need a container.
                current_div = self._new_page_div()
            current_div.append(element)

        # Flush the last container; an empty trailing one is not emitted.
        if current_div is not None and current_div.contents:
            pages.append(str(current_div))
        return ''.join(pages)

    def run(self):
        """Merge the parsed document and return the resulting HTML string."""
        merged = self.merge_elements()
        return self.combine_elements_to_html(merged)

def main():
    """Print a placeholder greeting to standard output."""
    greeting = 'hello world'
    print(greeting)

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
