Add Validator, fix default handling of column conversion, statistics stores charset for comparison

This commit is contained in:
Akumatic 2023-05-09 14:35:39 +02:00
parent 0194711016
commit ab30cb57ee
5 changed files with 254 additions and 26 deletions

View File

@ -15,8 +15,14 @@ If an error occurs during the conversion of a table or column, an output with th
## Usage
```
python convert.py [-h] [-v] [-s] -H HOST -P PORT -u USER -p PASSWORD -d DATABASE
python convert.py [-h] [-v] [-s | -V] -H HOST -P PORT -u USER -p PASSWORD -d DATABASE
```
Options:
- `-h/--help`
- `-s/--statistics`
- `-V/--validate`
Required arguments:
- `-H/--host HOST`
- `-P/--port PORT`
@ -25,6 +31,4 @@ Required arguments:
- `-d/--database DATABASE`
Optional arguments:
- `-h/--help`
- `-v/--verbose`
- `-s/--statistics`

View File

@ -3,8 +3,10 @@
import logging
import argparse
from json import dumps
from convert.validation import Validation
from convert.statistics import Statistics
from convert.utf8mb4converter import UTF8MB4Converter
from convert.utf8mb4converter import UTF8MB4Converter, DEFAULT_CHARSET
def main (
args: argparse.Namespace
@ -14,11 +16,11 @@ def main (
or converts the database itself, all tables and all text fields to utf8mb4 if they don't already
have this character set.
Params:
Parameters:
- args (argparse.Namespace)
- Contains arguments passed to the program
"""
logger: logging.Logger = logging.getLogger("Main")
db: UTF8MB4Converter = UTF8MB4Converter (
user = args.user,
password = args.password,
@ -28,12 +30,16 @@ def main (
)
if args.statistics:
stats = Statistics(db)
logging.getLogger("Main").info(f"Database statistics:\n{stats}")
stats: Statistics = Statistics(db)
logger.info(f"Database statistics:\n{stats}")
elif args.validate:
validator = Validation(db)
validation: dict = validator.convert_validate()
logger.info(f"Database conversion validation:\n{dumps(validation, indent=4)}")
else:
db.convert_charset_db()
db.convert_charset_all_columns_all_tables()
db.convert_charset_all_tables()
db.convert_charset_all()
def parse_args (
) -> argparse.Namespace:
@ -41,15 +47,18 @@ def parse_args (
Parses the arguments passed to the program.
Returns:
- An argparse namespace containing the parsed arguments
- An argparse namespace containing the parsed arguments
"""
argparser: argparse.ArgumentParser = argparse.ArgumentParser()
args_opt: argparse._ArgumentGroup = argparser.add_argument_group("Optional Arguments")
args_req: argparse._ArgumentGroup = argparser.add_argument_group("Required Arguments")
args_exc: argparse._MutuallyExclusiveGroup = argparser.add_mutually_exclusive_group()
args_opt.add_argument("-v", "--verbose", action="store_true")
args_opt.add_argument("-s", "--statistics", action="store_true")
args_exc.add_argument("-s", "--statistics", action="store_true")
args_exc.add_argument("-V", "--validate", action="store_true")
args_req.add_argument("-H", "--host", required=True)
args_req.add_argument("-P", "--port", required=True, type=int)

View File

@ -14,22 +14,29 @@ class Statistics:
- The converter object storing the database information and connection
- data (dict)
- A dictionary holding the generated data: Number of tables & columns and character set overview
- charset (str):
- A string storing the target charset
"""
def __init__ (
self,
dbcon: UTF8MB4Converter
dbcon: UTF8MB4Converter,
charset: str = DEFAULT_CHARSET
) -> None:
"""
Constructor of Statistics object. Generates statistics at creation.
Parameters:
- dbcon (UTF8MB4Converter)
- The converter object storing the database information and connection
- The converter object storing the database information and connection
- charset (str):
- the target charset for comparison
- default: DEFAULT_CHARSET from class UTF8MB4Converter
"""
self.dbcon = dbcon
self.data: dict = None
self.charset = charset
self.update_stats()
def __str__ (
@ -101,12 +108,12 @@ class Statistics:
},
"converted": {
"tables": {
"converted": charset_tab[DEFAULT_CHARSET],
"missing": count_tab - charset_tab[DEFAULT_CHARSET]
"converted": charset_tab[self.charset],
"missing": count_tab - charset_tab[self.charset]
},
"columns": {
"converted": charset_col[DEFAULT_CHARSET],
"missing": count_col - charset_col[DEFAULT_CHARSET] - charset_col[None]
"converted": charset_col[self.charset],
"missing": count_col - charset_col[self.charset] - charset_col[None]
}
}
}

View File

@ -291,12 +291,9 @@ class UTF8MB4Converter:
self.logger.debug(f"Column {col}(@{table}) already has character set {charset}")
return
if column['nullable'] == "YES":
constraint = "NULL"
else:
constraint = "NOT NULL"
if column['dvalue'] is not None:
constraint += f" DEFAULT {column['dvalue']}"
constraint = "NULL" if column["nullable"] == "YES" else "NOT NULL"
if column['dvalue'] is not None:
constraint += f" DEFAULT {column['dvalue']}"
query = " ".join((
f"ALTER TABLE {table} CHANGE {col} {col}",
@ -355,4 +352,25 @@ class UTF8MB4Converter:
tables = self.get_tables()
for table in tables:
self.convert_charset_all_columns_single_table(table, charset, collation)
self.convert_charset_all_columns_single_table(table, charset, collation)
def convert_charset_all (
        self,
        charset: str = DEFAULT_CHARSET,
        collation: str = DEFAULT_COLLATION
    ) -> None:
    """
    Runs the complete conversion in one call: first the database itself,
    then all columns of all tables, and finally all tables.

    Parameters:
        - charset (str)
            - target character set, handed unchanged to every conversion step
            - default value: DEFAULT_CHARSET
        - collation (str)
            - target collation, handed unchanged to every conversion step
            - default value: DEFAULT_COLLATION
    """
    # The order matters to callers reading the log: database, columns, tables.
    steps = (
        self.convert_charset_db,
        self.convert_charset_all_columns_all_tables,
        self.convert_charset_all_tables,
    )
    for step in steps:
        step(charset, collation)

190
convert/validation.py Normal file
View File

@ -0,0 +1,190 @@
# SPDX-License-Identifier: MIT
# Copyright (c) 2023 Akumatic
from collections import defaultdict
from convert.utf8mb4converter import UTF8MB4Converter
class MissingStateException(Exception):
    """
    Custom exception indicating a missing state from validation object.

    Validation.compare_states raises it when the start state or the end state
    has not been stored yet, i.e. is still None.
    """
class Validation:
    """
    Class for validating the conversion of the database. The column schema of the
    database before and after conversion is queried and compared. Deviations are shown.

    Attributes:
        - dbcon (UTF8MB4Converter)
            - The converter object storing the database information and connection
        - start (defaultdict)
            - The column schema per table before conversion.
              None until generate_start_state has been called.
        - end (defaultdict)
            - The column schema per table after conversion.
              None until generate_end_state has been called.
    """

    # Column schema fields that are skipped when two column schemas are compared,
    # because a character set conversion is expected to change them.
    _IGNORED_KEYS = frozenset((
        "CHARACTER_SET_NAME",
        "COLLATION_NAME",
        "CHARACTER_OCTET_LENGTH",
    ))

    def __init__(self, dbcon: UTF8MB4Converter) -> None:
        """
        Constructor of Validation object. No state is fetched here; start and end
        stay None until the matching generate_*_state method is called.

        Parameters:
            - dbcon (UTF8MB4Converter)
                - The converter object storing the database information and connection
        """
        self.dbcon: UTF8MB4Converter = dbcon
        self.start: defaultdict = None  # filled by generate_start_state()
        self.end: defaultdict = None  # filled by generate_end_state()

    def generate_start_state(self) -> None:
        """
        Fetches current column schema of the database and stores it in the start attribute.
        """
        self.start = self._get_state()

    def generate_end_state(self) -> None:
        """
        Fetches current column schema of the database and stores it in the end attribute.
        """
        self.end = self._get_state()

    def compare_states(self) -> dict:
        """
        Compares start and end state and collects information about changed schemas.
        generate_start_state and generate_end_state should be called first.

        Every column of the start state is counted exactly once. A table or column
        that is missing from the end state is compared against an empty schema, so
        it is reported with "After" values of None instead of raising a KeyError.

        Returns:
            - A dict with the keys "summary" (number of unaltered and altered columns)
              and "details" (table -> column -> differences, altered columns only).

        Raises:
            - MissingStateException
                - Raised when either start state or end state is not set
        """
        if self.start is None:
            raise MissingStateException("No start state stored. Make sure to call generate_start_state")
        if self.end is None:
            raise MissingStateException("No end state stored. Make sure to call generate_end_state")

        summary: dict = {"unaltered": 0, "altered": 0}
        details: dict = {}
        for table, columns_before in self.start.items():
            # .get() instead of indexing: indexing a defaultdict would silently
            # insert an empty entry for a table that no longer exists.
            columns_after: dict = self.end.get(table, {})
            for column, before in columns_before.items():
                diff: dict = self._get_differences(before, columns_after.get(column, {}))
                if diff:
                    summary["altered"] += 1
                    details.setdefault(table, {})[column] = diff
                else:
                    summary["unaltered"] += 1
        return {"summary": summary, "details": details}

    def convert_validate(self) -> dict:
        """
        Stores the start state, alters the charset and collation of the database,
        all columns and all tables, stores the end state and compares both states.

        Returns:
            - The result of compare_states: a dict containing a numeric summary
              and details about mismatched columns.
        """
        self.generate_start_state()
        self.dbcon.convert_charset_all()
        self.generate_end_state()
        return self.compare_states()

    def _get_differences(self, a: dict, b: dict) -> dict:
        """
        Compares the values of two column schemas key by key, skipping the fields
        changed by character set conversion (CHARACTER_SET_NAME, COLLATION_NAME
        and CHARACTER_OCTET_LENGTH).

        Parameters:
            - a (dict)
                - A dictionary containing the information schema of a column.
            - b (dict)
                - A dictionary containing the information schema of a column
                - Used for comparison with dictionary a

        Returns:
            - A dict mapping each deviating key to {"Before": value in a, "After": value in b}.
              A key that exists in only one of the dicts is compared against None.
        """
        data: dict = {}
        # Keys of a first (original order), followed by keys only b knows about.
        keys: list = list(a) + [key for key in b if key not in a]
        for key in keys:
            if key in self._IGNORED_KEYS:
                continue
            before, after = a.get(key), b.get(key)
            if before != after:
                data[key] = {"Before": before, "After": after}
        return data

    def _get_columns_of_table(self, table: str) -> list:
        """
        Fetches all information about the columns of a given table.

        Parameters:
            - table (str)
                - the table whose columns are to be retrieved

        Returns:
            - Whatever the cursor's fetchall() returns for the query; _get_state
              reads each row by the key "COLUMN_NAME".
        """
        # Schema and table name are bound as query parameters instead of being
        # pasted into the SQL text, so names containing quotes cannot break the query.
        # NOTE(review): assumes dbcon.kcursor accepts "%s" placeholders - confirm for the driver in use.
        query = " ".join((
            "SELECT * FROM information_schema.COLUMNS",
            "WHERE table_schema = %s AND table_name = %s"
        ))
        self.dbcon.kcursor.execute(query, (self.dbcon.db, table))
        return self.dbcon.kcursor.fetchall()

    def _get_state(self) -> defaultdict:
        """
        Fetches column schema of the database and stores it for each table and column.

        Returns:
            - A defaultdict that contains one dictionary for each table.
              Each table dictionary maps the column name to the schema row of that column.
        """
        state: defaultdict = defaultdict(dict)
        for table in self.dbcon.get_tables():
            for column in self._get_columns_of_table(table):
                state[table][column["COLUMN_NAME"]] = column
        return state