Add Validator, fix default handling of column conversion, statistics stores charset for comparison

This commit is contained in:
Akumatic 2023-05-09 14:35:39 +02:00
parent 0194711016
commit ab30cb57ee
5 changed files with 254 additions and 26 deletions

View File

@ -15,8 +15,14 @@ If an error occurs during the conversion of a table or column, an output with th
## Usage ## Usage
``` ```
python convert.py [-h] [-v] [-s] -H HOST -P PORT -u USER -p PASSWORD -d DATABASE python convert.py [-h] [-v] [-s | -V] -H HOST -P PORT -u USER -p PASSWORD -d DATABASE
``` ```
Options:
- `-h/--help`
- `-s/--statistics`
- `-V/--validate`
Required arguments: Required arguments:
- `-H/--host HOST` - `-H/--host HOST`
- `-P/--port PORT` - `-P/--port PORT`
@ -25,6 +31,4 @@ Required arguments:
- `-d/--database DATABASE` - `-d/--database DATABASE`
Optional arguments: Optional arguments:
- `-h/--help`
- `-v/--verbose` - `-v/--verbose`
- `-s/--statistics`

View File

@ -3,8 +3,10 @@
import logging import logging
import argparse import argparse
from json import dumps
from convert.validation import Validation
from convert.statistics import Statistics from convert.statistics import Statistics
from convert.utf8mb4converter import UTF8MB4Converter from convert.utf8mb4converter import UTF8MB4Converter, DEFAULT_CHARSET
def main ( def main (
args: argparse.Namespace args: argparse.Namespace
@ -14,11 +16,11 @@ def main (
or converts the database itself, all tables and all text fields to utf8mb4 if they don't already or converts the database itself, all tables and all text fields to utf8mb4 if they don't already
have this character set. have this character set.
Params: Parameters:
- args (argparse.Namespace) - args (argparse.Namespace)
- Contains arguments passed to the program - Contains arguments passed to the program
""" """
logger: logging.Logger = logging.getLogger("Main")
db: UTF8MB4Converter = UTF8MB4Converter ( db: UTF8MB4Converter = UTF8MB4Converter (
user = args.user, user = args.user,
password = args.password, password = args.password,
@ -28,12 +30,16 @@ def main (
) )
if args.statistics: if args.statistics:
stats = Statistics(db) stats: Statistics = Statistics(db)
logging.getLogger("Main").info(f"Database statistics:\n{stats}") logger.info(f"Database statistics:\n{stats}")
elif args.validate:
validator = Validation(db)
validation: dict = validator.convert_validate()
logger.info(f"Database conversion validation:\n{dumps(validation, indent=4)}")
else: else:
db.convert_charset_db() db.convert_charset_all()
db.convert_charset_all_columns_all_tables()
db.convert_charset_all_tables()
def parse_args ( def parse_args (
) -> argparse.Namespace: ) -> argparse.Namespace:
@ -41,15 +47,18 @@ def parse_args (
Parses the arguments passed to the program. Parses the arguments passed to the program.
Returns: Returns:
- An argparse namespace containing the parsed arguments - An argparse namespace containing the parsed arguments
""" """
argparser: argparse.ArgumentParser = argparse.ArgumentParser() argparser: argparse.ArgumentParser = argparse.ArgumentParser()
args_opt: argparse._ArgumentGroup = argparser.add_argument_group("Optional Arguments") args_opt: argparse._ArgumentGroup = argparser.add_argument_group("Optional Arguments")
args_req: argparse._ArgumentGroup = argparser.add_argument_group("Required Arguments") args_req: argparse._ArgumentGroup = argparser.add_argument_group("Required Arguments")
args_exc: argparse._MutuallyExclusiveGroup = argparser.add_mutually_exclusive_group()
args_opt.add_argument("-v", "--verbose", action="store_true") args_opt.add_argument("-v", "--verbose", action="store_true")
args_opt.add_argument("-s", "--statistics", action="store_true")
args_exc.add_argument("-s", "--statistics", action="store_true")
args_exc.add_argument("-V", "--validate", action="store_true")
args_req.add_argument("-H", "--host", required=True) args_req.add_argument("-H", "--host", required=True)
args_req.add_argument("-P", "--port", required=True, type=int) args_req.add_argument("-P", "--port", required=True, type=int)

View File

@ -14,22 +14,29 @@ class Statistics:
- The converter object storing the database information and connection - The converter object storing the database information and connection
- data (dict) - data (dict)
- A dictionary holding the generated data: Number of tables & columns and character set overview - A dictionary holding the generated data: Number of tables & columns and character set overview
- charset (str):
- A string storing the target charset
""" """
def __init__ ( def __init__ (
self, self,
dbcon: UTF8MB4Converter dbcon: UTF8MB4Converter,
charset: str = DEFAULT_CHARSET
) -> None: ) -> None:
""" """
Constructor of Statistics object. Generates statistics at creation. Constructor of Statistics object. Generates statistics at creation.
Parameters: Parameters:
- dbcon (UTF8MB4Converter) - dbcon (UTF8MB4Converter)
- The converter object storing the database information and connection - The converter object storing the database information and connection
- charset (str):
- the target charset for comparison
- default: DEFAULT_CHARSET from class UTF8MB4Converter
""" """
self.dbcon = dbcon self.dbcon = dbcon
self.data: dict = None self.data: dict = None
self.charset = charset
self.update_stats() self.update_stats()
def __str__ ( def __str__ (
@ -101,12 +108,12 @@ class Statistics:
}, },
"converted": { "converted": {
"tables": { "tables": {
"converted": charset_tab[DEFAULT_CHARSET], "converted": charset_tab[self.charset],
"missing": count_tab - charset_tab[DEFAULT_CHARSET] "missing": count_tab - charset_tab[self.charset]
}, },
"columns": { "columns": {
"converted": charset_col[DEFAULT_CHARSET], "converted": charset_col[self.charset],
"missing": count_col - charset_col[DEFAULT_CHARSET] - charset_col[None] "missing": count_col - charset_col[self.charset] - charset_col[None]
} }
} }
} }

View File

@ -291,12 +291,9 @@ class UTF8MB4Converter:
self.logger.debug(f"Column {col}(@{table}) already has character set {charset}") self.logger.debug(f"Column {col}(@{table}) already has character set {charset}")
return return
if column['nullable'] == "YES": constraint = "NULL" if column["nullable"] == "YES" else "NOT NULL"
constraint = "NULL" if column['dvalue'] is not None:
else: constraint += f" DEFAULT {column['dvalue']}"
constraint = "NOT NULL"
if column['dvalue'] is not None:
constraint += f" DEFAULT {column['dvalue']}"
query = " ".join(( query = " ".join((
f"ALTER TABLE {table} CHANGE {col} {col}", f"ALTER TABLE {table} CHANGE {col} {col}",
@ -355,4 +352,25 @@ class UTF8MB4Converter:
tables = self.get_tables() tables = self.get_tables()
for table in tables: for table in tables:
self.convert_charset_all_columns_single_table(table, charset, collation) self.convert_charset_all_columns_single_table(table, charset, collation)
def convert_charset_all (
    self,
    charset: str = DEFAULT_CHARSET,
    collation: str = DEFAULT_COLLATION
) -> None:
    """
    Runs the complete conversion: first the database itself, then the columns
    of all tables and finally all tables.

    Parameters:
        - charset (str)
            - target character set, handed to every conversion step
            - default value: DEFAULT_CHARSET
        - collation (str)
            - target collation, handed to every conversion step
            - default value: DEFAULT_COLLATION
    """
    # The order of the steps is kept exactly as listed: database, columns, tables.
    steps = (
        self.convert_charset_db,
        self.convert_charset_all_columns_all_tables,
        self.convert_charset_all_tables,
    )
    for step in steps:
        step(charset, collation)

190
convert/validation.py Normal file
View File

@ -0,0 +1,190 @@
# SPDX-License-Identifier: MIT
# Copyright (c) 2023 Akumatic
from collections import defaultdict
from convert.utf8mb4converter import UTF8MB4Converter
class MissingStateException(Exception):
    """
    Raised when a validation step needs a stored database state
    (start or end) that has not been generated yet.
    """
class Validation:
    """
    Class for validating the conversion of the database. The state of the database
    before and after conversion is queried and compared. Deviations are shown.

    Attributes:
        - dbcon (UTF8MB4Converter)
            - The converter object storing the database information and connection
        - start (defaultdict)
            - A dictionary holding the state of the database, before conversion.
            - None until generate_start_state was called.
        - end (defaultdict)
            - A dictionary holding the state of the database, after conversion.
            - None until generate_end_state was called.
    """

    # Column schema fields that a character set conversion changes on purpose.
    # They are excluded from the comparison in _get_differences.
    _IGNORED_KEYS = frozenset((
        "CHARACTER_SET_NAME",
        "COLLATION_NAME",
        "CHARACTER_OCTET_LENGTH",
    ))

    def __init__ (
        self,
        dbcon: "UTF8MB4Converter"
    ) -> None:
        """
        Constructor of Validation object. Only stores the converter; no state is
        fetched until generate_start_state / generate_end_state is called.

        Parameters:
            - dbcon (UTF8MB4Converter)
                - The converter object storing the database information and connection
        """
        self.dbcon: UTF8MB4Converter = dbcon
        self.start: defaultdict = None
        self.end: defaultdict = None

    def generate_start_state (
        self
    ) -> None:
        """
        Fetches current column schema of the database and stores it in the start attribute.
        """
        self.start = self._get_state()

    def generate_end_state (
        self
    ) -> None:
        """
        Fetches current column schema of the database and stores it in the end attribute.
        """
        self.end = self._get_state()

    def compare_states (
        self
    ) -> dict:
        """
        Compares start and end state and collects information about changed schemas.
        generate_start_state and generate_end_state should be called first.

        Every column of the start state is checked. A table or column that is
        missing from the end state is reported as altered (all compared fields
        get the "After" value None) instead of raising a KeyError. Neither stored
        state is modified by the comparison.

        Returns:
            - A dict containing a numeric summary and details about mismatched columns.

        Raises:
            - MissingStateException
                - Raised when either start state or end state is not set
        """
        if self.start is None:
            raise MissingStateException("No start state stored. Make sure to call generate_start_state")
        if self.end is None:
            raise MissingStateException("No end state stored. Make sure to call generate_end_state")

        summary: dict = {"unaltered": 0, "altered": 0}
        details: dict = {}
        for table, start_columns in self.start.items():
            # .get() instead of indexing: the states are defaultdicts, so indexing
            # would silently insert an empty entry for a table missing from end.
            end_columns: dict = self.end.get(table, {})
            for column, before in start_columns.items():
                comp: dict = self._get_differences(before, end_columns.get(column, {}))
                if comp:
                    summary["altered"] += 1
                    details.setdefault(table, {})[column] = comp
                else:
                    summary["unaltered"] += 1
        return {"summary": summary, "details": details}

    def convert_validate (
        self
    ) -> dict:
        """
        Alters the charset and collation of the database, all columns and all tables.
        Validates that no other field was changed.

        Returns:
            - A dict containing a numeric summary and details about mismatched columns.
        """
        self.generate_start_state()
        self.dbcon.convert_charset_all()
        self.generate_end_state()
        return self.compare_states()

    def _get_differences (
        self,
        a: dict,
        b: dict
    ) -> dict:
        """
        Compares two given column data sets and compares the values for all keys
        of a but for the fields changed by character set conversion
        (CHARACTER_SET_NAME, COLLATION_NAME and CHARACTER_OCTET_LENGTH).
        Stores before and after value for each deviating key.

        Parameters:
            - a (dict)
                - A dictionary containing the information schema of a column.
            - b (dict)
                - A dictionary containing the information schema of a column
                - Used for comparison with dictionary a
                - A key missing here is treated as the value None

        Returns:
            - A dict with the keys and values of deviations between the two given dicts,
              ignoring certain values changed by character set conversion.
        """
        data: dict = {}
        for key, before in a.items():
            if key in self._IGNORED_KEYS:
                continue
            after = b.get(key)
            if before != after:
                data[key] = {
                    "Before": before,
                    "After": after
                }
        return data

    def _get_columns_of_table (
        self,
        table: str
    ) -> list:
        """
        Fetches all information about the columns of a given table.

        Parameters:
            - table (str)
                - the table whose columns are to be retrieved

        Returns:
            - The rows fetched from information_schema.COLUMNS for the given table,
              one entry per column.
        """
        # Values are passed as query parameters instead of being formatted into the
        # statement, so schema or table names containing quotes cannot break the query.
        # NOTE(review): assumes kcursor is a DB-API cursor using the "%s" paramstyle
        # of the common MySQL drivers - confirm against the converter's connection.
        query = " ".join((
            "SELECT * FROM information_schema.COLUMNS",
            "WHERE table_schema = %s AND table_name = %s"
        ))
        self.dbcon.kcursor.execute(query, (self.dbcon.db, table))
        return self.dbcon.kcursor.fetchall()

    def _get_state (
        self
    ) -> defaultdict:
        """
        Fetches column schema of the database and stores it for each table and column.

        Returns:
            - A defaultdict that contains one dictionary for each table.
              Each table dictionary contains one dict per column, which contains the column schema.
        """
        state: defaultdict = defaultdict(dict)
        for table in self.dbcon.get_tables():
            for column in self._get_columns_of_table(table):
                state[table][column["COLUMN_NAME"]] = column
        return state