diff --git a/README.md b/README.md index 3171feb..24448f1 100644 --- a/README.md +++ b/README.md @@ -15,8 +15,14 @@ If an error occurs during the conversion of a table or column, an output with th ## Usage ``` -python convert.py [-h] [-v] [-s] -H HOST -P PORT -u USER -p PASSWORD -d DATABASE +python convert.py [-h] [-v] [-s | -V] -H HOST -P PORT -u USER -p PASSWORD -d DATABASE ``` + +Options: +- `-h/--help` +- `-s/--statistics` +- `-V/--validate` + Required arguments: - `-H/--host HOST` - `-P/--port PORT` @@ -25,6 +31,4 @@ Required arguments: - `-d/--database DATABASE` Optional arguments: -- `-h/--help` - `-v/--verbose` -- `-s/--statistics` \ No newline at end of file diff --git a/convert.py b/convert.py index 3c9a6db..fe41958 100644 --- a/convert.py +++ b/convert.py @@ -3,8 +3,10 @@ import logging import argparse +from json import dumps +from convert.validation import Validation from convert.statistics import Statistics -from convert.utf8mb4converter import UTF8MB4Converter +from convert.utf8mb4converter import UTF8MB4Converter, DEFAULT_CHARSET def main ( args: argparse.Namespace @@ -14,11 +16,11 @@ def main ( or converts the database itself, all tables and all text fields to utf8mb4 if they don't already have this character set. 
- Params: + Parameters: - args (argparse.Namespace) - Contains arguments passed to the program """ - + logger: logging.Logger = logging.getLogger("Main") db: UTF8MB4Converter = UTF8MB4Converter ( user = args.user, password = args.password, @@ -28,12 +30,16 @@ def main ( ) if args.statistics: - stats = Statistics(db) - logging.getLogger("Main").info(f"Database statistics:\n{stats}") + stats: Statistics = Statistics(db) + logger.info(f"Database statistics:\n{stats}") + + elif args.validate: + validator = Validation(db) + validation: dict = validator.convert_validate() + logger.info(f"Database conversion validation:\n{dumps(validation, indent=4)}") + else: - db.convert_charset_db() - db.convert_charset_all_columns_all_tables() - db.convert_charset_all_tables() + db.convert_charset_all() def parse_args ( ) -> argparse.Namespace: @@ -41,15 +47,18 @@ def parse_args ( Parses the arguments passed to the program. Returns: - - An argparse namespace containing the parsed arguments + - An argparse namespace containing the parsed arguments """ argparser: argparse.ArgumentParser = argparse.ArgumentParser() args_opt: argparse._ArgumentGroup = argparser.add_argument_group("Optional Arguments") args_req: argparse._ArgumentGroup = argparser.add_argument_group("Required Arguments") + args_exc: argparse._MutuallyExclusiveGroup = argparser.add_mutually_exclusive_group() args_opt.add_argument("-v", "--verbose", action="store_true") - args_opt.add_argument("-s", "--statistics", action="store_true") + + args_exc.add_argument("-s", "--statistics", action="store_true") + args_exc.add_argument("-V", "--validate", action="store_true") args_req.add_argument("-H", "--host", required=True) args_req.add_argument("-P", "--port", required=True, type=int) diff --git a/convert/statistics.py b/convert/statistics.py index 2db98fc..fd27af4 100644 --- a/convert/statistics.py +++ b/convert/statistics.py @@ -14,22 +14,29 @@ class Statistics: - The converter object storing the database information and 
connection - data (dict) - A dictionary holding the generated data: Number of tables & columns and character set overview + - charset (str): + - A string storing the target charset """ def __init__ ( self, - dbcon: UTF8MB4Converter + dbcon: UTF8MB4Converter, + charset: str = DEFAULT_CHARSET ) -> None: """ Constructor of Statistics object. Generates statistics at creation. Parameters: - dbcon (UTF8MB4Converter) - - The converter object storing the database information and connection + - The converter object storing the database information and connection + - charset (str): + - the target charset for comparison + - default: DEFAULT_CHARSET from class UTF8MB4Converter """ self.dbcon = dbcon self.data: dict = None + self.charset = charset self.update_stats() def __str__ ( @@ -101,12 +108,12 @@ class Statistics: }, "converted": { "tables": { - "converted": charset_tab[DEFAULT_CHARSET], - "missing": count_tab - charset_tab[DEFAULT_CHARSET] + "converted": charset_tab[self.charset], + "missing": count_tab - charset_tab[self.charset] }, "columns": { - "converted": charset_col[DEFAULT_CHARSET], - "missing": count_col - charset_col[DEFAULT_CHARSET] - charset_col[None] + "converted": charset_col[self.charset], + "missing": count_col - charset_col[self.charset] - charset_col[None] } } } \ No newline at end of file diff --git a/convert/utf8mb4converter.py b/convert/utf8mb4converter.py index 0885f12..845990a 100644 --- a/convert/utf8mb4converter.py +++ b/convert/utf8mb4converter.py @@ -291,12 +291,9 @@ class UTF8MB4Converter: self.logger.debug(f"Column {col}(@{table}) already has character set {charset}") return - if column['nullable'] == "YES": - constraint = "NULL" - else: - constraint = "NOT NULL" - if column['dvalue'] is not None: - constraint += f" DEFAULT {column['dvalue']}" + constraint = "NULL" if column["nullable"] == "YES" else "NOT NULL" + if column['dvalue'] is not None: + constraint += f" DEFAULT {column['dvalue']}" query = " ".join(( f"ALTER TABLE {table} CHANGE 
# SPDX-License-Identifier: MIT
# Copyright (c) 2023 Akumatic

from collections import defaultdict
from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    # The converter class is referenced in annotations only, so it is
    # imported for type checkers only.
    from convert.utf8mb4converter import UTF8MB4Converter

# Column attributes that a character set conversion is expected to change.
# They are excluded when the before/after column schemas are compared.
_CONVERSION_KEYS = frozenset((
    "CHARACTER_SET_NAME",
    "COLLATION_NAME",
    "CHARACTER_OCTET_LENGTH",
))


class MissingStateException(Exception):
    """
    Custom exception indicating a missing state from validation object.
    """


class Validation:
    """
    Class for validating the conversion of the database. The state of the database
    before and after conversion is queried and compared. Deviations are shown.

    Attributes:
        - dbcon (UTF8MB4Converter)
            - The converter object storing the database information and connection
        - start (defaultdict or None)
            - A dictionary holding the state of the database, before conversion.
            - None until generate_start_state was called
        - end (defaultdict or None)
            - A dictionary holding the state of the database, after conversion.
            - None until generate_end_state was called
    """

    def __init__(
        self,
        dbcon: "UTF8MB4Converter"
    ) -> None:
        """
        Constructor of Validation object. Only stores the converter; no state is
        fetched from the database until generate_start_state, generate_end_state
        or convert_validate is called.

        Parameters:
            - dbcon (UTF8MB4Converter)
                - The converter object storing the database information and connection
        """

        self.dbcon: "UTF8MB4Converter" = dbcon
        self.start: Optional[defaultdict] = None
        self.end: Optional[defaultdict] = None

    def generate_start_state(
        self
    ) -> None:
        """
        Fetches current column schema of the database and stores it in the start attribute.
        """

        self.start = self._get_state()

    def generate_end_state(
        self
    ) -> None:
        """
        Fetches current column schema of the database and stores it in the end attribute.
        """

        self.end = self._get_state()

    def compare_states(
        self
    ) -> dict:
        """
        Compares start and end state and collects information about changed schemas.
        generate_start_state and generate_end_state should be called first.

        Every column of the start state is counted exactly once. A column (or its
        whole table) that is missing from the end state is counted as altered, with
        None as the "After" value of each compared field. Neither state is modified.

        Returns:
            - A dict containing a numeric summary and details about mismatched columns.

        Raises:
            - MissingStateException
                - Raised when either start state or end state is not set
        """

        if self.start is None:
            raise MissingStateException("No start state stored. Make sure to call generate_start_state")
        if self.end is None:
            raise MissingStateException("No end state stored. Make sure to call generate_end_state")

        summary: dict = {"unaltered": 0, "altered": 0}
        details: defaultdict = defaultdict(dict)

        for table, before_columns in self.start.items():
            # .get() instead of indexing: indexing a defaultdict would insert
            # an empty entry for the table into the end state.
            after_columns: dict = self.end.get(table, {})
            for column, before in before_columns.items():
                after = after_columns.get(column)
                changes: dict = self._get_differences(
                    before, after if after is not None else {}
                )
                if after is not None and not changes:
                    summary["unaltered"] += 1
                else:
                    summary["altered"] += 1
                    details[table][column] = changes

        return {"summary": summary, "details": details}

    def convert_validate(
        self
    ) -> dict:
        """
        Alters the charset and collation of the database, all columns and all tables.
        Validates that no other field was changed.

        Returns:
            - A dict containing a numeric summary and details about mismatched columns.
        """

        self.generate_start_state()
        self.dbcon.convert_charset_all()
        self.generate_end_state()
        return self.compare_states()

    def _get_differences(
        self,
        a: dict,
        b: dict
    ) -> dict:
        """
        Compares two given column data sets for all keys of a, except for the fields
        changed by character set conversion (CHARACTER_SET_NAME, COLLATION_NAME and
        CHARACTER_OCTET_LENGTH). Stores before and after value for each deviation.

        Parameters:
            - a (dict)
                - A dictionary containing the information schema of a column.
            - b (dict)
                - A dictionary containing the information schema of a column
                - Used for comparison with dictionary a

        Returns:
            - A dict with the keys and values of deviations between the two given dicts,
              ignoring certain values changed by character set conversion. A key that is
              missing from b is reported as a deviation with None as "After" value.
        """

        # Sentinel to tell "key missing in b" apart from "value is None".
        missing = object()
        differences: dict = {}
        for key, before in a.items():
            if key in _CONVERSION_KEYS:
                continue
            after = b.get(key, missing)
            if after is missing or before != after:
                differences[key] = {
                    "Before": before,
                    "After": None if after is missing else after
                }
        return differences

    def _get_columns_of_table(
        self,
        table: str
    ) -> list:
        """
        Fetches all information about the columns of a given table.

        Parameters:
            - table (str)
                - the table whose columns are to be retrieved

        Returns:
            - A list of dicts, containing the full column information.
        """

        # Schema and table name are passed as query parameters instead of being
        # interpolated into the SQL text, so names containing quotes cannot
        # break or alter the statement.
        # NOTE(review): "%s" is the DB-API "format" paramstyle - confirm that
        # the driver behind dbcon.kcursor accepts it.
        query = (
            "SELECT * FROM information_schema.COLUMNS "
            "WHERE table_schema = %s AND table_name = %s"
        )
        cursor = self.dbcon.kcursor
        cursor.execute(query, (self.dbcon.db, table))
        return cursor.fetchall()

    def _get_state(
        self
    ) -> defaultdict:
        """
        Fetches column schema of the database and stores it for each table and column.

        Returns:
            - A defaultdict that contains one dictionary for each table.
              Each table dictionary contains one dict per column (keyed by its
              COLUMN_NAME), which contains the column schema.
        """

        state: defaultdict = defaultdict(dict)
        for table in self.dbcon.get_tables():
            for column in self._get_columns_of_table(table):
                state[table][column["COLUMN_NAME"]] = column
        return state