Skip to content

Package data processing

This module provides managing classes and methods for the tool.

BedGraph

A BedGraph object holds information and process bedgraph files with coverage.

Attributes:

  • filepath (str) –

    path to a bedgraph file

  • contig_sizes (dict) –

    Dictionary with contig sizes

  • coverage (dict) –

    Dictionary with format: key - contig id, value - pd.Series with coverage counts.

  • parameters (Parameters) –

    Parameters' class object that holds config and cmd arguments.

Source code in lovis4u/DataProcessing.py
class BedGraph:
    """A BedGraph object holds information and process bedgraph files with coverage.

    Attributes:
        filepath (str): path to a bedgraph file
        contig_sizes (dict): Dictionary with contig sizes
        coverage (dict):  Dictionary with format: key - contig id, value - pd.Series with coverage counts.
        parameters (lovis4u.Manager.Parameters): Parameters' class object that holds config and cmd arguments.

    """

    def __init__(self, filepath: str, contig_sizes: dict, parameters: lovis4u.Manager.Parameters):
        """Create a BedGraph object

        Arguments:
            filepath (str): path to a bedgraph file.
            contig_sizes:  Dictionary with format: key - contig id, value - full length of the corresponding contig.
            parameters (lovis4u.Manager.Parameters): Parameters' class object that holds config and cmd arguments.

        """
        self.filepath = filepath
        self.prms = parameters
        self.contig_sizes = contig_sizes
        self.coverage = self.__read_file()

    def __read_file(self):
        """Read correspnding bedgraph file.

        Returns:
            coverage (dict):  Dictionary with format: key - contig id, value - pd.Series with coverage counts.
        """
        try:
            coverage = dict()
            for contig_id, length in self.contig_sizes.items():
                coverage[contig_id] = pd.Series(np.zeros(length, dtype=int))

            with open(self.filepath, "r") as bedgraph:
                for line in bedgraph:
                    contig, start, end, counts = line.strip().split("\t")
                    if contig in coverage.keys():
                        coverage[contig][int(start):int(end)] = int(counts)

            ids_to_remove = [contig_id for contig_id, cov in coverage.items() if cov.sum() == 0]
            for contig_id in ids_to_remove:
                del coverage[contig_id]

            return coverage
        except Exception as error:
            raise lovis4u.Manager.lovis4uError(f"Unable to read {self.filepath} bedgraph file.") from error

    def get_window_coverage(self, contig, start, end):
        """Get coverage for a particular window.

        Arguments:
            contig (str): contig id
            start (int): start coordinate (0-based)
            end (int): end coordinate (0-based)
        """
        try:
            window_coverage = self.coverage[contig][start:end]
            return window_coverage
        except Exception as error:
            raise lovis4u.Manager.lovis4uError(f"Unable to extract window coverage {contig, start, end} for "
                                               f"{self.filepath} bedgraph file.") from error

__init__(filepath, contig_sizes, parameters)

Create a BedGraph object

Parameters:

  • filepath (str) –

    path to a bedgraph file.

  • contig_sizes (dict) –

    Dictionary with format: key - contig id, value - full length of the corresponding contig.

  • parameters (Parameters) –

    Parameters' class object that holds config and cmd arguments.

Source code in lovis4u/DataProcessing.py
def __init__(self, filepath: str, contig_sizes: dict, parameters: lovis4u.Manager.Parameters):
    """Create a BedGraph object

    Arguments:
        filepath (str): path to a bedgraph file.
        contig_sizes:  Dictionary with format: key - contig id, value - full length of the corresponding contig.
        parameters (lovis4u.Manager.Parameters): Parameters' class object that holds config and cmd arguments.

    """
    self.filepath = filepath
    self.prms = parameters
    self.contig_sizes = contig_sizes
    self.coverage = self.__read_file()

__read_file()

Read correspnding bedgraph file.

Returns:

  • coverage ( dict ) –

    Dictionary with format: key - contig id, value - pd.Series with coverage counts.

Source code in lovis4u/DataProcessing.py
def __read_file(self):
    """Read correspnding bedgraph file.

    Returns:
        coverage (dict):  Dictionary with format: key - contig id, value - pd.Series with coverage counts.
    """
    try:
        coverage = dict()
        for contig_id, length in self.contig_sizes.items():
            coverage[contig_id] = pd.Series(np.zeros(length, dtype=int))

        with open(self.filepath, "r") as bedgraph:
            for line in bedgraph:
                contig, start, end, counts = line.strip().split("\t")
                if contig in coverage.keys():
                    coverage[contig][int(start):int(end)] = int(counts)

        ids_to_remove = [contig_id for contig_id, cov in coverage.items() if cov.sum() == 0]
        for contig_id in ids_to_remove:
            del coverage[contig_id]

        return coverage
    except Exception as error:
        raise lovis4u.Manager.lovis4uError(f"Unable to read {self.filepath} bedgraph file.") from error

get_window_coverage(contig, start, end)

Get coverage for a particular window.

Parameters:

  • contig (str) –

    contig id

  • start (int) –

    start coordinate (0-based)

  • end (int) –

    end coordinate (0-based)

Source code in lovis4u/DataProcessing.py
def get_window_coverage(self, contig, start, end):
    """Get coverage for a particular window.

    Arguments:
        contig (str): contig id
        start (int): start coordinate (0-based)
        end (int): end coordinate (0-based)
    """
    try:
        window_coverage = self.coverage[contig][start:end]
        return window_coverage
    except Exception as error:
        raise lovis4u.Manager.lovis4uError(f"Unable to extract window coverage {contig, start, end} for "
                                           f"{self.filepath} bedgraph file.") from error

CoverageProfiles

A CoverageProfiles object holds information about a set of bedgraph profiles

Attributes:

  • bedgraph_files (list) –

    list with paths to bedgraph files.

  • contig_sizes

    Dictionary with format: key - contig id, value - full length of the corresponding contig.

  • bedgraphs (list) –

    list with BedGraph objects

  • parameters (Parameters) –

    Parameters' class object that holds config and cmd arguments.

Source code in lovis4u/DataProcessing.py
class CoverageProfiles:
    """A CoverageProfiles object holds information about a set of bedgraph profiles

    Attributes:
            bedgraph_files (list): list with paths to bedgraph files.
            contig_sizes:  Dictionary with format: key - contig id, value - full length of the corresponding contig.
            bedgraphs (list): list with BedGraph objects
            parameters (lovis4u.Manager.Parameters): Parameters' class object that holds config and cmd arguments.

    """

    def __init__(self, bedgraph_files: list, contig_sizes: dict, parameters: lovis4u.Manager.Parameters):
        """Create a CoverageProfiles object

        Arguments:
            bedgraph_files (list): list with paths to bedgraph files.
            parameters (lovis4u.Manager.Parameters): Parameters' class object that holds config and cmd arguments.
            contig_sizes:  Dictionary with format: key - contig id, value - full length of the corresponding contig.

        """
        self.bedgraph_files = bedgraph_files
        self.prms = parameters
        self.contig_sizes = contig_sizes
        self.bedgraphs = [BedGraph(filepath, contig_sizes, parameters) for filepath in bedgraph_files]

__init__(bedgraph_files, contig_sizes, parameters)

Create a CoverageProfiles object

Parameters:

  • bedgraph_files (list) –

    list with paths to bedgraph files.

  • parameters (Parameters) –

    Parameters' class object that holds config and cmd arguments.

  • contig_sizes (dict) –

    Dictionary with format: key - contig id, value - full length of the corresponding contig.

Source code in lovis4u/DataProcessing.py
def __init__(self, bedgraph_files: list, contig_sizes: dict, parameters: lovis4u.Manager.Parameters):
    """Create a CoverageProfiles object

    Arguments:
        bedgraph_files (list): list with paths to bedgraph files.
        parameters (lovis4u.Manager.Parameters): Parameters' class object that holds config and cmd arguments.
        contig_sizes:  Dictionary with format: key - contig id, value - full length of the corresponding contig.

    """
    self.bedgraph_files = bedgraph_files
    self.prms = parameters
    self.contig_sizes = contig_sizes
    self.bedgraphs = [BedGraph(filepath, contig_sizes, parameters) for filepath in bedgraph_files]

Feature

A Feature object represents a locus' feature and its properties.

Attributes:

  • feature_id (str) –

    Feature identifier.

  • feature_type (str) –

    Type of element (e.g. CDS or tRNA).

  • start (int) –

    1-based start genomic coordinate.

  • end (int) –

    1-based end genomic coordinates

  • strand (int) –

    Genomic strand (1: plus strand, -1: minus strand).

  • name (str) –

    Name of the feature which will be used as a label.

  • sequence (Seq) –

    Feature's sequence.

  • record (SeqRecord) –

    SeqRecord object of corresponding feature sequence.

  • group (str) –

    Feature group that defines feature's colour and meant to represent a set of homologous features. Can be set with Loci.mmseqs_cluster() method that uses mmseqs clustering to define CDS feature groups.

  • group_type (str) –

    Type of feature group that allow to visualise different set of feature groups differently (e.g. plot labels only for features which group_type is "variable" or "labeled").

  • category (str) –

    Feature category which initially is built to handle phrogs category annotation. In visualisation it defines the "category" colour annotation under features. Supposed to represent clusters on locus or any second layer of feature properties.

  • vis_prms (dict) –

    Visualisation parameters that holds colours, label and other info for Drawing methods.

  • overlapping (bool) –

    Whether feature overlaps with visualised region or not.

  • prms (Parameters) –

    Parameters' class object that holds config and cmd arguments.

Source code in lovis4u/DataProcessing.py
class Feature:
    """A Feature object represents a locus' feature and its properties.

    Attributes:
        feature_id (str): Feature identifier.
        feature_type (str): Type of element (e.g. CDS or tRNA).
        start (int): 1-based start genomic coordinate.
        end (int): 1-based end genomic coordinates
        strand (int): Genomic strand (1: plus strand, -1: minus strand).
        name (str): Name of the feature which will be used as a label.
        sequence (Bio.Seq.Seq): Feature's sequence.
        record (Bio.SeqRecord.SeqRecord): SeqRecord object of corresponding feature sequence.
        group (str): Feature group that defines feature's colour and meant to represent a set of homologous features.
            Can be set with Loci.mmseqs_cluster() method that uses mmseqs clustering to define CDS feature groups.
        group_type (str): Type of feature group that allow to visualise different set of feature groups differently
            (e.g. plot labels only for features which group_type is "variable" or "labeled").
        category (str): Feature category which initially is built to handle phrogs category annotation.
            In visualisation it defines the "category" colour annotation under features.
            Supposed to represent clusters on locus or any second layer of feature properties.
        vis_prms (dict): Visualisation parameters that holds colours, label and other info for Drawing methods.
        overlapping (bool): Whether feature overlaps with visualised region or not.
        prms (lovis4u.Manager.Parameters): Parameters' class object that holds config and cmd arguments.

    """

    def __init__(self, feature_id: str, feature_type: str, start: int, end: int, strand: int, name: str,
                 sequence: Bio.Seq.Seq, group: str, group_type: str, category: str, vis_prms: dict, overlapping: bool,
                 parameters: lovis4u.Manager.Parameters):
        """Create a Feature object.

        Arguments:
            feature_id (str): Feature identifier.
            feature_type (str): Type of element (e.g. CDS or tRNA). Currently only CDS are supported.
            start (int): 1-based start genomic coordinate.
            end (int): 1-based end genomic coordinates
            strand (int): Genomic strand (1: plus strand, -1: minus strand).
            name (str): Name of the feature which will be used as a label.
            sequence (Bio.Seq.Seq): Feature's sequence.
            group (str): Feature group that defines feature's colour and meant to represent a set of homologous features.
                Can be set with Loci.mmseqs_cluster() method that uses mmseqs clustering to define CDS feature groups.
            group_type (str): Type of feature group that allow to visualise different set of feature groups differently
                (e.g. plot labels only for features which group_type is "variable" or "labeled").
            category (str): Feature category which initially is built to handle phrogs category annotation.
                In visualisation it defines the "category" colour annotation under features.
                Supposed to represent clusters on locus or any second layer of feature properties.
            vis_prms (dict): Visualisation parameters that holds colours, label and other info for Drawing methods.
            prms (lovis4u.Manager.Parameters): Parameters' class object that holds config and cmd arguments.

        """
        self.feature_type = feature_type
        self.feature_id = feature_id
        self.start = start
        self.end = end
        self.strand = strand
        self.name = name
        self.sequence = sequence
        self.record = Bio.SeqRecord.SeqRecord(seq=self.sequence, id=self.feature_id)
        self.group = group  # maybe rename later
        self.group_type = group_type
        self.category = category
        self.vis_prms = vis_prms
        self.vis_prms["label"] = str(name)
        self.overlapping = overlapping
        self.prms = parameters

__init__(feature_id, feature_type, start, end, strand, name, sequence, group, group_type, category, vis_prms, overlapping, parameters)

Create a Feature object.

Parameters:

  • feature_id (str) –

    Feature identifier.

  • feature_type (str) –

    Type of element (e.g. CDS or tRNA). Currently only CDS are supported.

  • start (int) –

    1-based start genomic coordinate.

  • end (int) –

    1-based end genomic coordinates

  • strand (int) –

    Genomic strand (1: plus strand, -1: minus strand).

  • name (str) –

    Name of the feature which will be used as a label.

  • sequence (Seq) –

    Feature's sequence.

  • group (str) –

    Feature group that defines feature's colour and meant to represent a set of homologous features. Can be set with Loci.mmseqs_cluster() method that uses mmseqs clustering to define CDS feature groups.

  • group_type (str) –

    Type of feature group that allow to visualise different set of feature groups differently (e.g. plot labels only for features which group_type is "variable" or "labeled").

  • category (str) –

    Feature category which initially is built to handle phrogs category annotation. In visualisation it defines the "category" colour annotation under features. Supposed to represent clusters on locus or any second layer of feature properties.

  • vis_prms (dict) –

    Visualisation parameters that holds colours, label and other info for Drawing methods.

  • prms (Parameters) –

    Parameters' class object that holds config and cmd arguments.

Source code in lovis4u/DataProcessing.py
def __init__(self, feature_id: str, feature_type: str, start: int, end: int, strand: int, name: str,
             sequence: Bio.Seq.Seq, group: str, group_type: str, category: str, vis_prms: dict, overlapping: bool,
             parameters: lovis4u.Manager.Parameters):
    """Create a Feature object.

    Arguments:
        feature_id (str): Feature identifier.
        feature_type (str): Type of element (e.g. CDS or tRNA). Currently only CDS are supported.
        start (int): 1-based start genomic coordinate.
        end (int): 1-based end genomic coordinates
        strand (int): Genomic strand (1: plus strand, -1: minus strand).
        name (str): Name of the feature which will be used as a label.
        sequence (Bio.Seq.Seq): Feature's sequence.
        group (str): Feature group that defines feature's colour and meant to represent a set of homologous features.
            Can be set with Loci.mmseqs_cluster() method that uses mmseqs clustering to define CDS feature groups.
        group_type (str): Type of feature group that allow to visualise different set of feature groups differently
            (e.g. plot labels only for features which group_type is "variable" or "labeled").
        category (str): Feature category which initially is built to handle phrogs category annotation.
            In visualisation it defines the "category" colour annotation under features.
            Supposed to represent clusters on locus or any second layer of feature properties.
        vis_prms (dict): Visualisation parameters that holds colours, label and other info for Drawing methods.
        prms (lovis4u.Manager.Parameters): Parameters' class object that holds config and cmd arguments.

    """
    self.feature_type = feature_type
    self.feature_id = feature_id
    self.start = start
    self.end = end
    self.strand = strand
    self.name = name
    self.sequence = sequence
    self.record = Bio.SeqRecord.SeqRecord(seq=self.sequence, id=self.feature_id)
    self.group = group  # maybe rename later
    self.group_type = group_type
    self.category = category
    self.vis_prms = vis_prms
    self.vis_prms["label"] = str(name)
    self.overlapping = overlapping
    self.prms = parameters

Loci

A Loci object holds information about all loci to be plotted and methods for data preparation.

Attributes:

  • loci (list) –

    List of Locus objects.

  • locus_annotation (DataFrame) –

    Table with information about each locus that defines visualisation (e.g. coordinates for visualisation, description, etc).

  • feature_annotation (DataFrame) –

    Table with information about each feature that defines visualisation (e.g. group, name, category, etc).

  • prms (Parameters) –

    Parameters' class object that holds config and cmd arguments.

Source code in lovis4u/DataProcessing.py
 201
 202
 203
 204
 205
 206
 207
 208
 209
 210
 211
 212
 213
 214
 215
 216
 217
 218
 219
 220
 221
 222
 223
 224
 225
 226
 227
 228
 229
 230
 231
 232
 233
 234
 235
 236
 237
 238
 239
 240
 241
 242
 243
 244
 245
 246
 247
 248
 249
 250
 251
 252
 253
 254
 255
 256
 257
 258
 259
 260
 261
 262
 263
 264
 265
 266
 267
 268
 269
 270
 271
 272
 273
 274
 275
 276
 277
 278
 279
 280
 281
 282
 283
 284
 285
 286
 287
 288
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
class Loci:
    """A Loci object holds information about all loci to be plotted and methods for data preparation.

    Attributes:
        loci (list): List of Locus objects.
        locus_annotation (pd.DataFrame): Table with information about each locus that defines visualisation
            (e.g. coordinates for visualisation, description, etc).
        feature_annotation (pd.DataFrame): Table with information about each feature that defines visualisation
            (e.g. group, name, category, etc).
        prms (lovis4u.Manager.Parameters): Parameters' class object that holds config and cmd arguments.

    """

    def __init__(self, parameters: lovis4u.Manager.Parameters):
        """Create a Loci object.

        Arguments:
            parameters (lovis4u.Manager.Parameters): Parameters' class object that holds config and cmd arguments.
        """
        self.loci = []
        self.locus_annotation = pd.DataFrame(columns=["sequence_id", "length", "coordinates", "circular", "description",
                                                      "order", "group"]).set_index("sequence_id")
        self.feature_annotation = pd.DataFrame(columns=["feature_id", "locus_id", "coordinates", "feature_type", "name",
                                                        "group", "group_type", "category", "fill_colour",
                                                        "stroke_colour",
                                                        "show_label"]).set_index("feature_id")
        self.prms = parameters

    def __load_annotation_file(self, file_path: str, annotation_columns: list, index_column: str) -> pd.DataFrame:
        """Private method to load an annotation file.

        Arguments:
            file_path (str): File path for an annotation file to be loaded.
            annotation_columns (list): List of columns that should be considered.
            index_column (str): Column name to be considered as index.

        Returns:
              pd.DataFrame: Preprocessed annotation file.
        """
        annotation_table = pd.read_table(file_path)
        found_allowed_columns = [i for i in annotation_columns if i in annotation_table.columns]
        not_found_allowed_columns = [i for i in annotation_columns if i not in annotation_table.columns]
        annotation_table = annotation_table[found_allowed_columns].set_index(index_column)
        annotation_table[not_found_allowed_columns] = None
        return annotation_table

    def load_locus_annotation_file(self, file_path: str) -> None:
        """Load loci annotation file.

        Arguments:
            file_path (str): File path for a loci annotation file to be loaded.

        Returns:
            None

        """
        annotation_columns = ["sequence_id", "length", "coordinates", "circular", "description", "order", "group"]
        self.locus_annotation = self.__load_annotation_file(file_path, annotation_columns, "sequence_id")
        return None

    def load_feature_annotation_file(self, file_path: str) -> None:
        """Load features annotation file.

        Arguments:
            file_path (str): File path for a features annotation file to be loaded.

        Returns:
            None

        """
        annotation_columns = ["feature_id", "locus_id", "coordinates", "feature_type", "name", "group", "group_type",
                              "category", "fill_colour", "stroke_colour", "show_label"]
        self.feature_annotation = self.__load_annotation_file(file_path, annotation_columns, "feature_id")
        return None

    def __update_locus_annotation(self, record_id: str, record_description: str, record_length: int) -> None:
        """Private method for updating loci annotation.

        Arguments:
            record_id (str): Sequence identifier.
            record_description (str): Sequence description.
            record_length (int): Sequence length.

        Returns:
            None

        """
        if record_id not in self.locus_annotation.index:
            self.locus_annotation.loc[record_id] = {col: None for col in self.locus_annotation.columns}
        if self.prms.args["windows"]:
            coordinates = []
            for window in self.prms.args["windows"]:
                subwindows = window.strip().split(",")
                for subwindow in subwindows:
                    subwindow_split = subwindow.strip().split(":")
                    if len(subwindow_split) == 4 and subwindow_split[0] == record_id:
                        coordinate = ":".join(subwindow_split[1:])
                        coordinates.append(coordinate)
            coordinates = ",".join(coordinates)
        else:
            coordinates = f"1:{record_length}:1"
        default_values = dict(length=record_length, coordinates=coordinates,
                              description=record_description, circular=1, order=len(self.loci), group=1)
        self.locus_annotation.loc[record_id] = self.locus_annotation.loc[record_id].fillna(default_values)
        return None

    def __update_feature_annotation(self, feature_id: str, locus_id: str, coordinates: str, feature_type: str,
                                    category: str, name: str) -> None:
        """Private method for updating feature annotation.

        Arguments:
            feature_id (str): Feature identifier.
            locus_id (str): Sequence description.
            coordinates (str): Feature coordinates.
            category (str): Feature type.
            name (str): Feature name.


        Returns:
            None

        """
        if feature_id not in self.feature_annotation.index:
            self.feature_annotation.loc[feature_id] = {col: None for col in self.feature_annotation.columns}

        if self.feature_annotation.loc[feature_id]["group_type"] in self.prms.args["feature_group_types_to_show_label"] \
                or "all" in self.prms.args["feature_group_types_to_show_label"]:
            show_label = 1
        else:
            show_label = 0
        stroke_colour = "default"
        if self.prms.args["set_feature_stroke_colour_based_on_fill_colour"] and \
                self.feature_annotation.loc[feature_id]["fill_colour"] and \
                self.feature_annotation.loc[feature_id]["fill_colour"] != "default":
            stroke_colour = lovis4u.Methods.scale_lightness(self.feature_annotation.loc[feature_id]["fill_colour"],
                                                            self.prms.args["feature_stroke_colour_relative_lightness"])
        default_values = dict(locus_id=locus_id, coordinates=coordinates, feature_type=feature_type,
                              name=name, group="", group_type="undefined", category=category, fill_colour="default",
                              stroke_colour=stroke_colour,
                              show_label=show_label)
        self.feature_annotation.loc[feature_id] = self.feature_annotation.loc[feature_id].fillna(default_values)
        return None

    def load_loci_from_extended_gff(self, input_f: str, ilund4u_mode: bool = False) -> None:
        """Load loci from the folder with gff files. Each GFF file also should contain corresponding nucleotide
            sequence. Such files are produced for example by pharokka annotation tool.

        All files with extension other than .gff (not case-sensitive) will be ignored.

        Arguments:
            input_folder: folder name with gff files.

        Returns:
            None

        """

        try:
            if isinstance(input_f, str):
                input_folder = input_f
                if not os.path.exists(input_folder):
                    raise lovis4u.Manager.lovis4uError(f"Folder/file {input_folder} does not exist.")
                if os.path.isfile(input_f):
                    gff_files = [input_f]
                else:
                    gff_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder)]
            elif isinstance(input_f, list):
                gff_files = input_f
            else:
                raise lovis4u.Manager.lovis4uError(f"The input for the GFF parsing function must be either a folder or "
                                                   f"a list of files.")
            if not gff_files:
                raise lovis4u.Manager.lovis4uError(f"Folder {input_f} does not contain files.")
            if self.prms.args["verbose"]:
                print(f"○ Reading gff file{'s' if len(gff_files) > 1 else ''}...", file=sys.stdout)
            for gff_file in gff_files:
                try:
                    gff_records = list(BCBio.GFF.parse(gff_file, limit_info=dict(gff_type=["CDS", "tRNA", "tmRNA",
                                                                                           "RNA",
                                                                                           "pseudogene", "rRNA",
                                                                                           "misc_RNA",
                                                                                           "ncRNA", "lncRNA"])))
                    if len(gff_records) != 1:
                        print(f"○ Warning: gff file {gff_file} contains information for more than 1 "
                              f"sequence. File will be skipped.")
                        continue
                    gff_record = gff_records[0]
                    try:
                        record_locus_sequence = gff_record.seq
                    except Bio.Seq.UndefinedSequenceError:
                        print(f"○ Warning: gff file {gff_file} doesn't contain corresponding sequences.")
                        continue
                    if self.prms.args["gff_description_source"] in gff_record.annotations:
                        record_description = gff_record.annotations[self.prms.args["gff_description_source"]][0]
                        if isinstance(record_description, tuple):
                            record_description = " ".join(record_description)
                    else:
                        record_description = ""
                    if self.prms.args["use_filename_as_contig_id"]:
                        gff_record.id = os.path.splitext(os.path.basename(gff_file))[0]
                    self.__update_locus_annotation(gff_record.id, record_description, len(record_locus_sequence))
                    locus_annotation_row = self.locus_annotation.loc[gff_record.id]
                    coordinates = [dict(zip(["start", "end", "strand"], map(int, c.split(":")))) for c in
                                   locus_annotation_row["coordinates"].split(",")]
                    record_locus = Locus(seq_id=gff_record.id, coordinates=coordinates,
                                         description=locus_annotation_row["description"],
                                         sequence=Bio.SeqRecord.SeqRecord(seq=record_locus_sequence, id=gff_record.id),
                                         circular=locus_annotation_row["circular"],
                                         length=locus_annotation_row["length"], parameters=self.prms, features=[],
                                         order=locus_annotation_row["order"])
                    features_ids = [i.id for i in gff_record.features]
                    if len(features_ids) != len(set(features_ids)):
                        raise lovis4u.Manager.lovis4uError(f"Gff file {gff_file} contains duplicated feature ids while"
                                                           f" only unique are allowed.")
                    for gff_feature in gff_record.features:
                        feature_id = gff_feature.id
                        if ilund4u_mode or self.prms.args["add_locus_id_prefix"]:
                            if gff_record.id not in feature_id:
                                feature_id = f"{gff_record.id}-{feature_id}"
                        if gff_feature.type == "CDS":
                            transl_table = self.prms.args["default_transl_table"]
                            if "transl_table" in gff_feature.qualifiers.keys():
                                transl_table = int(gff_feature.qualifiers["transl_table"][0])
                            name = ""
                            if self.prms.args["gff_CDS_name_source"] in gff_feature.qualifiers:
                                name = gff_feature.qualifiers[self.prms.args["gff_CDS_name_source"]][0]
                            category = ""
                            if self.prms.args["gff_CDS_category_source"] in gff_feature.qualifiers:
                                category = ",".join(gff_feature.qualifiers[self.prms.args["gff_CDS_category_source"]])
                            sequence = gff_feature.translate(record_locus_sequence, table=transl_table,
                                                             cds=False)[:-1]
                        else:
                            name, category = "", ""
                            if self.prms.args["gff_noncoding_name_source"] in gff_feature.qualifiers:
                                name = self.prms.args["gff_noncoding_name_source"][0]
                            else:
                                for ans in self.prms.args["gff_noncoding_name_alternative_source"]:
                                    if ans in gff_feature.qualifiers:
                                        name = gff_feature.qualifiers[ans][0]
                                        break
                            if not name:
                                name = feature_id
                            category = gff_feature.type
                            sequence = gff_feature.extract(record_locus_sequence)
                        for coordinate in record_locus.coordinates:
                            overlapping = False
                            start, end = coordinate["start"], coordinate["end"]
                            if start <= gff_feature.location.start + 1 <= end or start <= gff_feature.location.end <= end:
                                overlapping = True
                                break
                        if not overlapping and not self.prms.args["cluster_all_proteins"]:
                            continue
                        self.__update_feature_annotation(feature_id, record_locus.seq_id,
                                                         f"{int(gff_feature.location.start) + 1}:"
                                                         f"{int(gff_feature.location.end)}:{gff_feature.location.strand}",
                                                         gff_feature.type, category, name)
                        feature_annotation_row = self.feature_annotation.loc[feature_id]
                        feature = Feature(feature_type=feature_annotation_row["feature_type"],
                                          feature_id=feature_id, start=int(gff_feature.location.start) + 1,
                                          end=int(gff_feature.location.end), strand=gff_feature.location.strand,
                                          name=feature_annotation_row["name"],
                                          sequence=sequence,
                                          group=feature_annotation_row["group"],
                                          group_type=feature_annotation_row["group_type"],
                                          category=feature_annotation_row["category"],
                                          vis_prms=dict(fill_colour=feature_annotation_row["fill_colour"],
                                                        stroke_colour=feature_annotation_row["stroke_colour"],
                                                        show_label=feature_annotation_row["show_label"],
                                                        hmmscan_hit=0),
                                          overlapping=overlapping, parameters=self.prms)
                        record_locus.features.append(feature)
                    self.loci.append(record_locus)
                except:
                    print(f"○ Warning: gff file {gff_file} was not read properly and skipped")
                    if self.prms.args["parsing_debug"]:
                        self.prms.args["debug"] = True
                        raise lovis4u.Manager.lovis4uError()
            seq_id_to_order = self.locus_annotation["order"].to_dict()
            loci_ids = [l.seq_id for l in self.loci]
            if len(loci_ids) != len(set(loci_ids)):
                raise lovis4u.Manager.lovis4uError(f"The input gff files have duplicated contig ids. "
                                                   f"You can use `--use-filename-as-id` parameter to use file name "
                                                   f"as contig id which can help to fix the problem.")
            feature_ids = [feature.feature_id for locus in self.loci for feature in locus.features]
            if len(feature_ids) != len(set(feature_ids)):
                raise lovis4u.Manager.lovis4uError(f"The input gff files have duplicated features ids. "
                                                   f"You can use `--add-locus-id-prefix` (`-alip`) parameter to add "
                                                   f"contig id prefix to each feature id which can help to "
                                                   f"fix the problem.")
            self.loci.sort(key=lambda locus: seq_id_to_order[locus.seq_id])
            if self.prms.args["verbose"]:
                print(f"⦿ {len(self.loci)} {'locus was' if len(self.loci) == 1 else 'loci were'} loaded from the gff "
                      f"files folder", file=sys.stdout)
            return None
        except Exception as error:
            raise lovis4u.Manager.lovis4uError("Unable to load loci from gff folder.") from error

    def load_loci_from_gb(self, input_folder: str) -> None:
        """Load loci from the folder with genbank files.

        All files with extension other than .gb (not case-sensitive) will be ignored.

        Arguments:
            input_folder: folder name with gb files.

        Returns:
            None

        """
        if not os.path.exists(input_folder):
            raise lovis4u.Manager.lovis4uError(f"Folder/file {input_folder} does not exist.")
        try:
            if os.path.isfile(input_folder):
                gb_files = [input_folder]
            else:
                gb_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder)]
            if not gb_files:
                raise lovis4u.Manager.lovis4uError(f"Folder {input_folder} does not contain files.")
            if self.prms.args["verbose"]:
                print(f"○ Reading gb file{'s' if len(gb_files) > 1 else ''}...", file=sys.stdout)
            for gb_file in gb_files:
                try:
                    gb_records = list(Bio.SeqIO.parse(gb_file, "genbank"))
                    if len(gb_records) != 1:
                        print(f"○ Warning: gb file {gb_file} contains information for more than 1 "
                              f"sequence. File will be skipped.")
                        continue
                    gb_record = gb_records[0]
                    record_locus_sequence = gb_record.seq
                    if self.prms.args["genbank_description_source"] == "description":
                        record_description = gb_record.description
                    elif "annotations:" in self.prms.args["genbank_description_source"]:
                        feature_description_key = self.prms.args["genbank_description_source"].split(":")[1]
                        record_description = gb_record.annotations[feature_description_key]
                    else:
                        record_description = ""
                    if self.prms.args["use_filename_as_contig_id"]:
                        gb_record.id = os.path.splitext(os.path.basename(gb_file))[0]
                    self.__update_locus_annotation(gb_record.id, record_description, len(record_locus_sequence))
                    locus_annotation_row = self.locus_annotation.loc[gb_record.id]
                    coordinates = [dict(zip(["start", "end", "strand"], map(int, c.split(":")))) for c in
                                   locus_annotation_row["coordinates"].split(",")]
                    record_locus = Locus(seq_id=gb_record.id, coordinates=coordinates,
                                         description=locus_annotation_row["description"],
                                         sequence=Bio.SeqRecord.SeqRecord(seq=record_locus_sequence, id=gb_record.id),
                                         circular=locus_annotation_row["circular"],
                                         length=locus_annotation_row["length"], parameters=self.prms, features=[],
                                         order=locus_annotation_row["order"])

                    gb_CDSs = [i for i in gb_record.features if i.type == "CDS"]
                    first_CDS_record = gb_CDSs[0]
                    id_source = self.prms.args["genbank_id_source"]
                    if self.prms.args["genbank_id_source"] not in first_CDS_record.qualifiers:
                        for alternative_id_source in self.prms.args["genbank_id_alternative_source"]:
                            if alternative_id_source in first_CDS_record.qualifiers:
                                id_source = alternative_id_source
                                if self.prms.args["verbose"]:
                                    print(f"○ Warning: there is no <{self.prms.args['genbank_id_source']}> attribute "
                                          f"for CDS records in {gb_file}. Alternative <{id_source}> was used instead.",
                                          file=sys.stdout)
                                break
                        if id_source == self.prms.args["genbank_id_source"]:
                            print(f"There is no <{self.prms.args['genbank_id_source']}> "
                                  f"attribute for CDS record found in {gb_file}. We tried to"
                                  f" find any from the alternative list: "
                                  f"{','.join(self.prms.args['genbank_id_alternative_source'])}"
                                  f", but they also weren't found.")  # add about cmd parameter
                    features_ids = [i.qualifiers[id_source][0] for i in gb_CDSs if id_source in i.qualifiers]
                    if len(features_ids) != len(set(features_ids)):
                        print(f"GB file {gb_record} contains duplicated feature ids while"
                              f" only unique are allowed.")
                    for gb_feature in gb_record.features:
                        if id_source not in gb_feature.qualifiers:
                            print(f"    ○ Warning: genbank feature for {gb_file} located at {gb_feature.location} "
                                  f"was skipped\n    since it does not have id qualifier {id_source} found for other "
                                  f"features.\n    it could be a case of zero length ORF. ", file=sys.stdout)
                            continue
                        feature_id = gb_feature.qualifiers[id_source][0].replace("|", "_")
                        if self.prms.args["add_locus_id_prefix"]:
                            if gb_record.id not in feature_id:
                                feature_id = f"{gb_record.id}-{feature_id}"
                        transl_table = self.prms.args["default_transl_table"]
                        if "transl_table" in gb_feature.qualifiers.keys():
                            transl_table = int(gb_feature.qualifiers["transl_table"][0])
                        name = ""
                        if gb_feature.type == "CDS":
                            if self.prms.args["genbank_CDS_name_source"] in gb_feature.qualifiers:
                                name = gb_feature.qualifiers[self.prms.args["genbank_CDS_name_source"]][0]
                            category = ""
                            if self.prms.args["genbank_CDS_category_source"] in gb_feature.qualifiers:
                                category = ",".join(
                                    gb_feature.qualifiers[self.prms.args["genbank_CDS_category_source"]])
                            sequence = gb_feature.translate(record_locus_sequence, table=transl_table, cds=False)[:-1]
                        else:
                            name, category = "", ""
                            if self.prms.args["gff_noncoding_name_source"] in gb_feature.qualifiers:
                                name = self.prms.args["gff_noncoding_name_source"][0]
                            else:
                                for ans in self.prms.args["gff_noncoding_name_alternative_source"]:
                                    if ans in gb_feature.qualifiers:
                                        name = gb_feature.qualifiers[ans][0]
                                        break
                            if not name:
                                name = feature_id
                            category = gb_feature.type
                            sequence = gb_feature.extract(record_locus_sequence)
                        for coordinate in record_locus.coordinates:
                            overlapping = False
                            start, end = coordinate["start"], coordinate["end"]
                            if start <= gb_feature.location.start + 1 <= end or start <= gb_feature.location.end <= end:
                                overlapping = True
                                break
                        if not overlapping and not self.prms.args["cluster_all_proteins"]:
                            continue
                        self.__update_feature_annotation(feature_id, record_locus.seq_id,
                                                         f"{int(gb_feature.location.start) + 1}:"
                                                         f"{int(gb_feature.location.end)}:"
                                                         f"{gb_feature.location.strand}", gb_feature.type, category,
                                                         name)
                        feature_annotation_row = self.feature_annotation.loc[feature_id]
                        feature = Feature(feature_type=feature_annotation_row["feature_type"],
                                          feature_id=feature_id, start=int(gb_feature.location.start) + 1,
                                          end=int(gb_feature.location.end),
                                          strand=gb_feature.location.strand,
                                          name=feature_annotation_row["name"],
                                          sequence=sequence,
                                          group=feature_annotation_row["group"],
                                          group_type=feature_annotation_row["group_type"],
                                          category=feature_annotation_row["category"],
                                          vis_prms=dict(fill_colour=feature_annotation_row["fill_colour"],
                                                        stroke_colour=feature_annotation_row["stroke_colour"],
                                                        show_label=feature_annotation_row["show_label"],
                                                        hmmscan_hit=0),
                                          overlapping=overlapping,
                                          parameters=self.prms)

                        record_locus.features.append(feature)
                    self.loci.append(record_locus)
                except:
                    print(f"○ Warning: gb file {gb_file} was not read properly and skipped")
                    if self.prms.args["parsing_debug"]:
                        self.prms.args["debug"] = True
                        raise lovis4u.Manager.lovis4uError()
            seq_id_to_order = self.locus_annotation["order"].to_dict()
            loci_ids = [l.seq_id for l in self.loci]
            if len(loci_ids) != len(set(loci_ids)):
                raise lovis4u.Manager.lovis4uError(f"The input gb files have duplicated contig ids. "
                                                   f"You can use `--use-filename-as-id` parameter to use file name "
                                                   f"as contig id which can help to fix the problem.")
            feature_ids = [feature.feature_id for locus in self.loci for feature in locus.features]
            if len(feature_ids) != len(set(feature_ids)):
                raise lovis4u.Manager.lovis4uError(f"The input gb files have duplicated features ids. "
                                                   f"You can use `--add-locus-id-prefix` (`-alip`) parameter to add "
                                                   f"contig id prefix to each feature id which can help to "
                                                   f"fix the problem.")
            self.loci.sort(key=lambda locus: seq_id_to_order[locus.seq_id])
            if self.prms.args["verbose"]:
                print(f"⦿ {len(self.loci)} {'locus was' if len(self.loci) == 1 else 'loci were'} loaded from the "
                      f"genbank files folder", file=sys.stdout)
            return None
        except Exception as error:
            raise lovis4u.Manager.lovis4uError("Unable to load loci from gb folder.") from error

    def save_locus_annotation_table(self) -> None:
        """Save loci annotation table to the output folder.

        Output file name is locus_annotation_table.tsv

        Returns:
            None

        """
        try:
            if not os.path.exists(self.prms.args["output_dir"]):
                os.mkdir(self.prms.args["output_dir"])
            locus_ids = [locus.seq_id for locus in self.loci]
            self.locus_annotation = self.locus_annotation.loc[self.locus_annotation.index.isin(locus_ids)]
            file_path = os.path.join(self.prms.args["output_dir"], "locus_annotation_table.tsv")
            self.locus_annotation.to_csv(file_path, sep="\t", index_label="sequence_id")
            if self.prms.args["verbose"]:
                print(f"⦿ Loci annotation table was saved to {file_path}", file=sys.stdout)
            return None
        except Exception as error:
            raise lovis4u.Manager.lovis4uError("Unable to save loci annotation table.") from error

    def save_feature_annotation_table(self) -> None:
        """Save feature annotation table to the output folder.

        Output file name is feature_annotation_table.tsv

        Returns:
            None

        """
        try:
            if not os.path.exists(self.prms.args["output_dir"]):
                os.mkdir(self.prms.args["output_dir"])
            feature_ids = [feature.feature_id for locus in self.loci for feature in locus.features]
            self.feature_annotation = self.feature_annotation.loc[self.feature_annotation.index.isin(feature_ids)]
            file_path = os.path.join(self.prms.args["output_dir"], "feature_annotation_table.tsv")
            self.feature_annotation.to_csv(file_path, sep="\t", index_label="feature_id")
            if self.prms.args["verbose"]:
                print(f"⦿ Feature annotation table was saved to {file_path}", file=sys.stdout)
            return None
        except Exception as error:
            raise lovis4u.Manager.lovis4uError("Unable to save feature annotation table.") from error

    def mmseqs_cluster(self) -> pd.DataFrame:
        """Cluster all proteins using mmseqs in order to define groups of homologues.

        Returns:
            pd.DataFrame: parsed mmseqs table (pandas dataframe) with columns: cluster, protein_id; where cluster is
                defined by representative sequence id within a corresponding cluster.

        """
        if self.prms.args["verbose"]:
            print(f"○ Running mmseqs for protein clustering...", file=sys.stdout)
        try:
            feature_records = [feature.record for locus in self.loci for feature in locus.features if
                               feature.feature_type == "CDS"]
            temp_input = tempfile.NamedTemporaryFile()
            Bio.SeqIO.write(feature_records, temp_input.name, "fasta")
            if not os.path.exists(self.prms.args["output_dir"]):
                os.mkdir(self.prms.args["output_dir"])
            mmseqs_output_folder = os.path.join(self.prms.args["output_dir"], "mmseqs")
            if os.path.exists(mmseqs_output_folder):
                shutil.rmtree(mmseqs_output_folder)
            os.mkdir(mmseqs_output_folder)
            Bio.SeqIO.write(feature_records, os.path.join(mmseqs_output_folder, "input_proteins.fa"), "fasta")
            mmseqs_output_folder_db = os.path.join(mmseqs_output_folder, "DB")
            os.mkdir(mmseqs_output_folder_db)
            mmseqs_stdout = open(os.path.join(mmseqs_output_folder, "mmseqs_stdout.txt"), "w")
            mmseqs_stderr = open(os.path.join(mmseqs_output_folder, "mmseqs_stderr.txt"), "w")
            subprocess.run([self.prms.args["mmseqs_binary"], "createdb", temp_input.name,
                            os.path.join(mmseqs_output_folder_db, "sequencesDB")], stdout=mmseqs_stdout,
                           stderr=mmseqs_stderr)
            subprocess.run([self.prms.args["mmseqs_binary"], "cluster",
                            os.path.join(mmseqs_output_folder_db, "sequencesDB"),
                            os.path.join(mmseqs_output_folder_db, "clusterDB"),
                            os.path.join(mmseqs_output_folder_db, "tmp"),
                            "--cluster-mode", str(self.prms.args["mmseqs_cluster_mode"]),
                            "--cov-mode", str(self.prms.args["mmseqs_cov_mode"]),
                            "--min-seq-id", str(self.prms.args["mmseqs_min_seq_id"]),
                            "-c", str(self.prms.args["mmseqs_c"]),
                            "-s", str(self.prms.args["mmseqs_s"])], stdout=mmseqs_stdout, stderr=mmseqs_stderr)
            subprocess.run([self.prms.args["mmseqs_binary"], "createtsv",
                            os.path.join(mmseqs_output_folder_db, "sequencesDB"),
                            os.path.join(mmseqs_output_folder_db, "sequencesDB"),
                            os.path.join(mmseqs_output_folder_db, "clusterDB"),
                            os.path.join(mmseqs_output_folder, "mmseqs_clustering.tsv")],
                           stdout=mmseqs_stdout, stderr=mmseqs_stderr)
            mmseqs_clustering_results = pd.read_table(os.path.join(mmseqs_output_folder, "mmseqs_clustering.tsv"),
                                                      sep="\t", header=None, names=["cluster", "protein_id"])
            mmseqs_clustering_results = mmseqs_clustering_results.set_index("protein_id")

            num_of_unique_clusters = len(set(mmseqs_clustering_results["cluster"].to_list()))
            num_of_proteins = len(mmseqs_clustering_results.index)
            if self.prms.args["verbose"]:
                print(f"⦿ {num_of_unique_clusters} clusters for {num_of_proteins} proteins were found with mmseqs\n"
                      f"\tmmseqs clustering results were saved to "
                      f"{os.path.join(mmseqs_output_folder, 'mmseqs_clustering.tsv')}", file=sys.stdout)
            return mmseqs_clustering_results
        except Exception as error:
            raise lovis4u.Manager.lovis4uError("Unable to run mmseqs clustering. In case you use a linux machine"
                                               ",  have you run a post-install 'lovis4u --linux` command to switch to"
                                               " the linux mmseqs binary?") from error

    def define_feature_groups(self, dataframe: pd.DataFrame, group_column_name: str = "cluster") -> None:
        """Set features attribute "group" based on input dataframe.

        By default it is designed to use mmseqs_cluster() function results as input. If you already have precomputed
            feature groups you can set them with feature table.

        Arguments:
            dataframe (pd.DataFrame): dataframe with feature id - group pairs. Its index column should
                represent all loci CDS features.
            group_column_name (str): column name of the dataframe that represent corresponding group to each feature.

        Returns:
            None

        """
        try:

            for locus in self.loci:
                for feature in locus.features:
                    if feature.group and self.prms.args["keep_predefined_groups"]:
                        continue
                    if feature.feature_type == "CDS":
                        feature.group = dataframe.loc[feature.feature_id, group_column_name]
                    else:
                        feature.group = feature.name
                    self.feature_annotation.loc[feature.feature_id, "group"] = feature.group
            return None
        except Exception as error:
            raise lovis4u.Manager.lovis4uError("Unable to define protein features groups.") from error

    def remove_non_overlapping_features(self) -> None:
        """Removes features that are not overlapping with visualisation window.

        Returns:
            None
        """
        try:
            ids_of_non_overlapping_objects = []
            for locus in self.loci:
                ids_of_non_overlapping_objects += [obj.feature_id for obj in locus.features if not obj.overlapping]
                filtered_objects = [obj for obj in locus.features if obj.overlapping]
                locus.features = filtered_objects
            if ids_of_non_overlapping_objects:
                print("○ Warning message: LoVis4u clusters all proteins by default to define their classes"
                      "\n\t('variable' or 'conserved'), including those outside the visualisation window."
                      "\n\tTo cluster only proteins within the visualised area, use the -cl-owp parameter.")
            self.feature_annotation = self.feature_annotation.drop(ids_of_non_overlapping_objects)
        except Exception as error:
            raise lovis4u.Manager.lovis4uError("Unable to clean non overlapping features.") from error

    def define_labels_to_be_shown(self):
        """Set feature visaulisation attribute "show_label" based on feature groups.

        controlled by feature_labels_to_ignore, feature_group_types_to_show_label, and
            feature_group_types_to_show_label_on_first_occurrence parameters.

        Returns:
            None

        """
        try:
            added_first_occurrence_labels = []
            for locus in self.loci:
                for feature in locus.features:
                    if self.prms.args["show_all_feature_labels"]:
                        feature.vis_prms["show_label"] = 1
                        continue
                    if feature.vis_prms["label"] not in self.prms.args["feature_labels_to_ignore"]:
                        if "any" in self.prms.args["feature_group_types_to_show_label"]:
                            feature.vis_prms["show_label"] = 1
                        elif feature.group_type in self.prms.args["feature_group_types_to_show_label"] or \
                                (feature.vis_prms["hmmscan_hit"] and self.prms.args["show_all_label_for_query_"
                                                                                    "proteins"]):
                            feature.vis_prms["show_label"] = 1
                        elif (feature.group_type in \
                              self.prms.args["feature_group_types_to_show_label_on_first_occurrence"]) or \
                                (feature.vis_prms["hmmscan_hit"] and self.prms.args["show_label_on_first_occurrence"
                                                                                    "_for_query_proteins"]):
                            if feature.group not in added_first_occurrence_labels or len(self.loci) == 1:
                                feature.vis_prms["show_label"] = 1
                                added_first_occurrence_labels.append(feature.group)
                    else:
                        feature.vis_prms["show_label"] = 0
        except Exception as error:
            raise lovis4u.Manager.lovis4uError("Unable define feature labels to be shown.") from error

    def cluster_sequences(self, dataframe: pd.DataFrame, one_cluster: bool) -> None:
        """Define loci order and clusters with proteome similarity based hierarchical clustering.
            This function changes the order of loci that are plotted and also updates corresponding to each loci group
            attribute which defines homologues groups of proteomes.

        It's designed to use as input mmseqs_cluster() function results. However, if you have obtained homologues
            groups by other method you can also build pandas dataframe based on that with index corresponding to
            feature id and column "cluster" corresponding to the group.

        Arguments:
              dataframe (pd.DataFrame): dataframe with feature id - group pairs. Its index column should
                represent all loci CDS features.
                one_cluster (bool): consider all sequences to be members of one cluster, but still define the
                optimal order.

        Returns:
            None

        """
        try:
            proteins_loci_dict = collections.defaultdict(collections.deque)
            loci_clusters_dict = dict()
            number_of_loci = len(self.loci)
            if number_of_loci < 2:
                return None
            proteome_sizes = pd.Series(np.zeros(number_of_loci, dtype=int))
            for locus_index in range(number_of_loci):
                locus = self.loci[locus_index]
                loci_clusters = [dataframe.loc[feature.feature_id, "cluster"] for feature in locus.features if
                                 feature.feature_type == "CDS"]
                loci_clusters_dict[locus_index] = list(set(loci_clusters))
                proteome_sizes.iloc[locus_index] = len(set(loci_clusters))
                for l_cl in loci_clusters:
                    proteins_loci_dict[l_cl].append(locus_index)

            loci_ids = [locus.seq_id for locus in self.loci]
            similarity_matrix = pd.DataFrame(0.0, index=loci_ids, columns=loci_ids)
            for locus_index in range(number_of_loci):
                counts = pd.Series(np.zeros(number_of_loci, dtype=int))
                for cluster in loci_clusters_dict[locus_index]:
                    js = proteins_loci_dict[cluster]
                    counts.iloc[js] += 1
                locus_size = proteome_sizes[locus_index]
                norm_factors = pd.Series(0.5 * (locus_size + proteome_sizes) / (locus_size * proteome_sizes),
                                         index=counts.index)
                weights = counts.mul(norm_factors)
                similarity_matrix.iloc[locus_index] = weights
            symmetric_distance_matrix = 1 - similarity_matrix
            np.fill_diagonal(symmetric_distance_matrix.values, 0)
            linkage_matrix = scipy.cluster.hierarchy.linkage(
                scipy.spatial.distance.squareform(symmetric_distance_matrix),
                method="average")
            dendrogram = scipy.cluster.hierarchy.dendrogram(linkage_matrix, no_plot=True)
            if not one_cluster:
                clusters = pd.Series(scipy.cluster.hierarchy.fcluster(linkage_matrix,
                                                                      self.prms.args["clustering_h_value"],
                                                                      criterion="distance"),
                                     index=loci_ids)
                for locus in self.loci:
                    locus.group = clusters[locus.seq_id]
                    self.locus_annotation.loc[locus.seq_id, "group"] = locus.group
            order = dendrogram["leaves"][::-1]
            self.locus_annotation["initial_order"] = self.locus_annotation["order"]
            for locus_index in range(number_of_loci):
                locus = self.loci[locus_index]
                self.locus_annotation.loc[locus.seq_id, "order"] = order.index(locus_index)
            self.locus_annotation.sort_values(by="order", inplace=True)
            seq_id_to_order = self.locus_annotation["order"].to_dict()
            self.loci.sort(key=lambda locus: seq_id_to_order[locus.seq_id])

            reordered_similarity_matrix = similarity_matrix.reindex(index=self.locus_annotation.index,
                                                                    columns=self.locus_annotation.index)
            if not os.path.exists(self.prms.args["output_dir"]):
                os.mkdir(self.prms.args["output_dir"])
            file_path = os.path.join(self.prms.args["output_dir"], "proteome_similarity_matrix.tsv")
            reordered_similarity_matrix.to_csv(file_path, sep="\t")
            num_of_loci_groups = len(set(self.locus_annotation["group"].to_list()))
            if self.prms.args["verbose"]:
                if num_of_loci_groups == 1:
                    print(f"⦿ Loci order and {num_of_loci_groups} cluster was defined with proteome similarity based "
                          f"hierarchical clustering", file=sys.stdout)
                elif num_of_loci_groups > 1:
                    print(f"⦿ Loci order and {num_of_loci_groups} clusters were defined with proteome similarity based "
                          f"hierarchical clustering", file=sys.stdout)
                print(f"⦿ Proteome similarity matrix of loci was saved to {file_path}", file=sys.stdout)
            return None
        except Exception as error:
            raise lovis4u.Manager.lovis4uError("Unable to cluster loci sequences.") from error

    def find_variable_feature_groups(self, mmseqs_results: pd.DataFrame) -> None:
        """Define feature group type attributes (variable or conserved) based on their conservation in corresponding
            loci group feature.

        It's designed to use as input mmseqs_cluster() function results. However, if you have obtained homologues
            groups by other method you can also build pandas dataframe based on that with index corresponding to
            feature id and column "cluster" corresponding to the group.

        Arguments:
              mmseqs_results (pd.DataFrame): dataframe with feature id - group pairs. Its index column should
                represent all loci CDS features.

        Returns:
            None

        """
        try:
            loci_clusters_sizes = self.locus_annotation["group"].value_counts()
            loci_clusters_cutoff_v = np.round(self.prms.args["CDS_is_variable_cutoff"] * loci_clusters_sizes).astype(
                int)
            loci_clusters_cutoff_c = np.round(self.prms.args["CDF_is_conserved_cutoff"] * loci_clusters_sizes).astype(
                int)
            loci_clusters_cutoff_v[loci_clusters_cutoff_v == 0] = 1
            cluster_types = collections.defaultdict(dict)
            for cluster in set(mmseqs_results["cluster"].to_list()):
                cluster_proteins = mmseqs_results[mmseqs_results["cluster"] == cluster].index
                cluster_loci = [locus for locus in self.loci if
                                any(feature.feature_id in cluster_proteins for feature in locus.features)]
                cluster_loci_groups = [locus.group for locus in cluster_loci]
                for cluster_locus_group in cluster_loci_groups:
                    current_group_cluster_loci = [locus.seq_id for locus in cluster_loci if
                                                  locus.group == cluster_locus_group]
                    current_group_cluster_size = len(set(current_group_cluster_loci))
                    if loci_clusters_sizes[cluster_locus_group] > 1 and \
                            current_group_cluster_size <= loci_clusters_cutoff_v[cluster_locus_group]:
                        cluster_types[cluster_locus_group][cluster] = "variable"
                    elif loci_clusters_sizes[cluster_locus_group] > 1 and \
                            (loci_clusters_cutoff_v[cluster_locus_group] < current_group_cluster_size <
                             loci_clusters_cutoff_c[cluster_locus_group]):
                        cluster_types[cluster_locus_group][cluster] = "intermediate"
                    else:
                        cluster_types[cluster_locus_group][cluster] = "conserved"
            for locus in self.loci:
                locus_group = locus.group
                for feature in locus.features:
                    if feature.feature_type == "CDS":
                        if feature.group_type and self.prms.args["keep_predefined_groups"]:
                            continue
                        feature.group_type = cluster_types[locus_group][feature.group]
                    else:
                        feature.group_type = "noncoding"
                    self.feature_annotation.loc[feature.feature_id, "group_type"] = feature.group_type
            return None
        except Exception as error:
            raise lovis4u.Manager.lovis4uError("Unable to define variable feature groups.") from error

    def set_feature_colours_based_on_groups(self) -> None:
        """Define features fill colour based on corresponding feature group and group types.

        Returns:
            None

        """
        try:
            feature_groups = set([feature.group for locus in self.loci for feature in locus.features
                                  if (feature.group and feature.feature_type == "CDS")])
            if self.prms.args["feature_group_types_to_set_colour"] and \
                    "all" not in self.prms.args["feature_group_types_to_set_colour"]:
                feature_groups = set([feature.group for locus in self.loci for feature in locus.features
                                      if feature.group and feature.feature_type == "CDS" and feature.group_type in
                                      self.prms.args["feature_group_types_to_set_colour"]])
            number_of_unique_feature_groups = len(feature_groups)
            if self.prms.args["groups_fill_colour_palette_lib"] == "seaborn":
                colours_rgb = seaborn.color_palette(self.prms.args["groups_fill_colour_seaborn_palette"],
                                                    number_of_unique_feature_groups,
                                                    desat=self.prms.args["groups_fill_colour_seaborn_desat"])
                random.shuffle(colours_rgb)
            elif self.prms.args["groups_fill_colour_palette_lib"] == "distinctipy":
                colours_rgb = distinctipy.get_colors(number_of_unique_feature_groups,
                                                     exclude_colours=[(1, 1, 1), (0, 0, 0)],
                                                     pastel_factor=self.prms.args["groups_fill_colours_pastel_factor"])
            colours = list(map(lambda x: matplotlib.colors.rgb2hex(x), colours_rgb))
            colours_dict = {g: c for g, c in zip(list(feature_groups), colours)}
            for locus in self.loci:
                for feature in locus.features:
                    if feature.group in feature_groups and feature.feature_type == "CDS":
                        if self.prms.args["keep_predefined_colours"] and feature.vis_prms["fill_colour"] != "default":
                            continue
                        feature.vis_prms["fill_colour"] = colours_dict[feature.group]
                        self.feature_annotation.loc[feature.feature_id, "fill_colour"] = feature.vis_prms["fill_colour"]
            return None
        except Exception as error:
            raise lovis4u.Manager.lovis4uError("Unable to set feature colours based on groups.") from error

    def set_category_colours(self, use_table: bool = True) -> None:
        """Define colours for each category.

        Arguments:
            use_table (bool): Bool value whether table with predefined colours should be used or not.

        Returns:
            None

        """
        try:
            colours_dict = dict()
            if use_table:
                colours_dict.update(
                    pd.read_table(self.prms.args["category_colours"]).set_index("category")["colour"].to_dict())

            feature_categories = list(set([feature.category for locus in self.loci for feature in locus.features
                                           if feature.category and feature.category]))
            if not feature_categories:
                if self.prms.args["verbose"]:
                    print("○ Warning: there are no feature categories to set colours", file=sys.stdout)
            colours_dict = {cat: col for cat, col in colours_dict.items() if cat in feature_categories}

            feature_categories = [ff for ff in feature_categories if ff not in colours_dict.keys()]
            number_of_unique_feature_functions = len(feature_categories)
            colours_rgb = seaborn.color_palette(self.prms.args["category_colour_seaborn_palette"],
                                                number_of_unique_feature_functions,
                                                desat=self.prms.args["category_colour_seaborn_desat"])
            random.shuffle(colours_rgb)
            colours = list(map(lambda x: matplotlib.colors.rgb2hex(x), colours_rgb))
            colours_dict.update({g: c for g, c in zip(list(feature_categories), colours)})
            for locus in self.loci:
                locus.category_colours = colours_dict
            return None
        except Exception as error:
            raise lovis4u.Manager.lovis4uError("Unable to set category colours.") from error

    def pyhmmer_annotation(self) -> None:
        """Run pyhhmmer hmmscan against a set of databases for additional annotation of hotspot proteins.

        Arguments:
            proteomes (Proteomes): Proteomes object.

        Returns:
            None

        """
        try:
            if self.prms.args["verbose"]:
                print(f"○ Preparing data for additional protein annotation with pyhmmer hmmscan...",
                      file=sys.stdout)

            cds_records = [feature.record for locus in self.loci for feature in locus.features if
                           feature.feature_type == "CDS"]
            hmmscan_output_folder = os.path.join(self.prms.args["output_dir"], "hmmscan")
            if os.path.exists(hmmscan_output_folder):
                shutil.rmtree(hmmscan_output_folder)
            os.mkdir(hmmscan_output_folder)
            hmmscan_input_fasta_file_path = os.path.join(hmmscan_output_folder, "input_proteins.fa")
            Bio.SeqIO.write(cds_records, hmmscan_input_fasta_file_path, "fasta")

            alignment_table = lovis4u.Methods.run_pyhmmer(hmmscan_input_fasta_file_path, len(cds_records), self.prms)
            if not alignment_table.empty:
                found_hits_for = alignment_table.index.to_list()
                for locus in self.loci:
                    for feature in locus.features:
                        if feature.feature_id in found_hits_for:
                            alignment_table_row = alignment_table.loc[feature.feature_id]
                            if self.prms.args["update_category_with_database_name"]:
                                feature.category = alignment_table_row["target_db"]
                            if self.prms.args["update_protein_name_with_target_name"]:
                                feature.name = alignment_table_row["target"]
                                feature.vis_prms["label"] = feature.name
                            feature.vis_prms["hmmscan_hit"] = 1
            return None
        except Exception as error:
            raise lovis4u.Manager.lovis4uError("Unable to run pyhmmer hmmscan annotation.") from error

    def reorient_loci(self, ilund4u_mode: bool = False) -> None:
        """Auto re-orient loci (reset strands) of loci if they are not matched.

        Function tries to maximise co-orientation of homologous features.

        Returns:
            None

        """
        try:
            count_of_changed_strands = 0
            loci = [locus for locus in self.loci]
            for locus_index in range(1, len(loci)):
                p_locus = loci[locus_index - 1]
                c_locus = loci[locus_index]
                p_locus_strands = list(set([c["strand"] for c in p_locus.coordinates]))
                c_locus_strands = list(set([c["strand"] for c in c_locus.coordinates]))
                if len(p_locus_strands) == 1 and len(c_locus_strands) == 1:
                    if not ilund4u_mode:
                        pr_locus_features_groups = set([f.group for f in p_locus.features])
                        c_locus_features_groups = set([f.group for f in c_locus.features])
                    else:
                        pr_locus_features_groups = set(
                            [f.group for f in p_locus.features if f.group_type == "conserved"])
                        c_locus_features_groups = set(
                            [f.group for f in c_locus.features if f.group_type == "conserved"])
                    overlapped_f_groups = pr_locus_features_groups & c_locus_features_groups
                    prl_strand, cl_strand = p_locus_strands[0], c_locus_strands[0]
                    pr_locus_features_strands = {f.group: f.strand * prl_strand for f in p_locus.features if
                                                 f.group in overlapped_f_groups}
                    c_locus_features_strands = {f.group: f.strand * cl_strand for f in c_locus.features if
                                                f.group in overlapped_f_groups}
                    codirection_score = 0

                    for ovg in overlapped_f_groups:
                        codirection_score += pr_locus_features_strands[ovg] * c_locus_features_strands[ovg]
                    if codirection_score < 0:
                        count_of_changed_strands += 1
                        annot_coordinates = []
                        for cc in loci[locus_index].coordinates:
                            cc["strand"] *= -1
                            annot_coordinates.append(f"{cc['start']}:{cc['end']}:{cc['strand']}")
                        loci[locus_index].coordinates = loci[locus_index].coordinates[::-1]
                        annot_coordinates = annot_coordinates[::-1]
                        self.locus_annotation.loc[c_locus.seq_id, "coordinates"] = ",".join(annot_coordinates)
                else:
                    if self.prms.args["verbose"]:
                        print("○ Warning: loci reorientation cannot be applied for loci that have both strands in"
                              " pre-defined coordinates for visualisation")
            if self.prms.args["verbose"]:
                if count_of_changed_strands == 0:
                    print(f"⦿ Orientation was not changed for any locus", file=sys.stdout)
                elif count_of_changed_strands == 1:
                    print(f"⦿ Orientation was changed for 1 locus", file=sys.stdout)
                elif count_of_changed_strands > 1:
                    print(f"⦿ Orientation was changed for {count_of_changed_strands} loci", file=sys.stdout)

            return None
        except Exception as error:
            raise lovis4u.Manager.lovis4uError("Unable to define variable feature groups.") from error

    def get_loci_lengths_and_n_of_regions(self) -> list[list[int]]:
        """Get loci lengths and number of regions.

        Returns:
            list: list each element of each contains locus size and number of breaks for visualisation track.

        """
        try:
            loci_sizes = []
            for locus in self.loci:
                number_of_gaps = len(locus.coordinates) - 1
                if locus.circular:
                    for i in range(number_of_gaps):
                        if locus.coordinates[i]["end"] == locus.length and locus.coordinates[i + 1]["start"] == 1:
                            number_of_gaps -= 1
                loci_sizes.append([locus.size, number_of_gaps])
            return loci_sizes
        except Exception as error:
            raise lovis4u.Manager.lovis4uError("Unable to get loci lengths.") from error

    def get_contig_sizes(self):
        """Helper method to get a dictionary with contig sizes for bedgraph file parsing

        Returns:
            dict: dictionary with contig sizes
        """
        try:
            contig_sizes = dict()
            for index, row in self.locus_annotation.iterrows():
                contig_sizes[index] = row["length"]
            return contig_sizes
        except Exception as error:
            raise lovis4u.Manager.lovis4uError("Unable to get contig sizes dict.") from error

__init__(parameters)

Create a Loci object.

Parameters:

  • parameters (Parameters) –

    Parameters' class object that holds config and cmd arguments.

Source code in lovis4u/DataProcessing.py
def __init__(self, parameters: lovis4u.Manager.Parameters):
    """Create a Loci object.

    Arguments:
        parameters (lovis4u.Manager.Parameters): Parameters' class object that holds config and cmd arguments.
    """
    self.loci = []
    self.locus_annotation = pd.DataFrame(columns=["sequence_id", "length", "coordinates", "circular", "description",
                                                  "order", "group"]).set_index("sequence_id")
    self.feature_annotation = pd.DataFrame(columns=["feature_id", "locus_id", "coordinates", "feature_type", "name",
                                                    "group", "group_type", "category", "fill_colour",
                                                    "stroke_colour",
                                                    "show_label"]).set_index("feature_id")
    self.prms = parameters

__load_annotation_file(file_path, annotation_columns, index_column)

Private method to load an annotation file.

Parameters:

  • file_path (str) –

    File path for an annotation file to be loaded.

  • annotation_columns (list) –

    List of columns that should be considered.

  • index_column (str) –

    Column name to be considered as index.

Returns:

  • DataFrame

    pd.DataFrame: Preprocessed annotation file.

Source code in lovis4u/DataProcessing.py
def __load_annotation_file(self, file_path: str, annotation_columns: list, index_column: str) -> pd.DataFrame:
    """Private method to load an annotation file.

    Arguments:
        file_path (str): File path for an annotation file to be loaded.
        annotation_columns (list): List of columns that should be considered.
        index_column (str): Column name to be considered as index.

    Returns:
          pd.DataFrame: Preprocessed annotation file.
    """
    annotation_table = pd.read_table(file_path)
    found_allowed_columns = [i for i in annotation_columns if i in annotation_table.columns]
    not_found_allowed_columns = [i for i in annotation_columns if i not in annotation_table.columns]
    annotation_table = annotation_table[found_allowed_columns].set_index(index_column)
    annotation_table[not_found_allowed_columns] = None
    return annotation_table

__update_feature_annotation(feature_id, locus_id, coordinates, feature_type, category, name)

Private method for updating feature annotation.

Parameters:

  • feature_id (str) –

    Feature identifier.

  • locus_id (str) –

    Sequence description.

  • coordinates (str) –

    Feature coordinates.

  • category (str) –

    Feature type.

  • name (str) –

    Feature name.

Returns:

  • None

    None

Source code in lovis4u/DataProcessing.py
def __update_feature_annotation(self, feature_id: str, locus_id: str, coordinates: str, feature_type: str,
                                category: str, name: str) -> None:
    """Private method for updating feature annotation.

    Arguments:
        feature_id (str): Feature identifier.
        locus_id (str): Sequence description.
        coordinates (str): Feature coordinates.
        category (str): Feature type.
        name (str): Feature name.


    Returns:
        None

    """
    if feature_id not in self.feature_annotation.index:
        self.feature_annotation.loc[feature_id] = {col: None for col in self.feature_annotation.columns}

    if self.feature_annotation.loc[feature_id]["group_type"] in self.prms.args["feature_group_types_to_show_label"] \
            or "all" in self.prms.args["feature_group_types_to_show_label"]:
        show_label = 1
    else:
        show_label = 0
    stroke_colour = "default"
    if self.prms.args["set_feature_stroke_colour_based_on_fill_colour"] and \
            self.feature_annotation.loc[feature_id]["fill_colour"] and \
            self.feature_annotation.loc[feature_id]["fill_colour"] != "default":
        stroke_colour = lovis4u.Methods.scale_lightness(self.feature_annotation.loc[feature_id]["fill_colour"],
                                                        self.prms.args["feature_stroke_colour_relative_lightness"])
    default_values = dict(locus_id=locus_id, coordinates=coordinates, feature_type=feature_type,
                          name=name, group="", group_type="undefined", category=category, fill_colour="default",
                          stroke_colour=stroke_colour,
                          show_label=show_label)
    self.feature_annotation.loc[feature_id] = self.feature_annotation.loc[feature_id].fillna(default_values)
    return None

__update_locus_annotation(record_id, record_description, record_length)

Private method for updating loci annotation.

Parameters:

  • record_id (str) –

    Sequence identifier.

  • record_description (str) –

    Sequence description.

  • record_length (int) –

    Sequence length.

Returns:

  • None

    None

Source code in lovis4u/DataProcessing.py
def __update_locus_annotation(self, record_id: str, record_description: str, record_length: int) -> None:
    """Private method for updating loci annotation.

    Arguments:
        record_id (str): Sequence identifier.
        record_description (str): Sequence description.
        record_length (int): Sequence length.

    Returns:
        None

    """
    if record_id not in self.locus_annotation.index:
        self.locus_annotation.loc[record_id] = {col: None for col in self.locus_annotation.columns}
    if self.prms.args["windows"]:
        coordinates = []
        for window in self.prms.args["windows"]:
            subwindows = window.strip().split(",")
            for subwindow in subwindows:
                subwindow_split = subwindow.strip().split(":")
                if len(subwindow_split) == 4 and subwindow_split[0] == record_id:
                    coordinate = ":".join(subwindow_split[1:])
                    coordinates.append(coordinate)
        coordinates = ",".join(coordinates)
    else:
        coordinates = f"1:{record_length}:1"
    default_values = dict(length=record_length, coordinates=coordinates,
                          description=record_description, circular=1, order=len(self.loci), group=1)
    self.locus_annotation.loc[record_id] = self.locus_annotation.loc[record_id].fillna(default_values)
    return None

cluster_sequences(dataframe, one_cluster)

Define loci order and clusters with proteome similarity based hierarchical clustering. This function changes the order of loci that are plotted and also updates corresponding to each loci group attribute which defines homologues groups of proteomes.

It's designed to use as input mmseqs_cluster() function results. However, if you have obtained homologues groups by other method you can also build pandas dataframe based on that with index corresponding to feature id and column "cluster" corresponding to the group.

Parameters:

  • dataframe (DataFrame) –

    dataframe with feature id - group pairs. Its index column should represent all loci CDS features. one_cluster (bool): consider all sequences to be members of one cluster, but still define the optimal order.

Returns:

  • None

    None

Source code in lovis4u/DataProcessing.py
def cluster_sequences(self, dataframe: pd.DataFrame, one_cluster: bool) -> None:
    """Define loci order and clusters with proteome similarity based hierarchical clustering.
        This function changes the order of loci that are plotted and also updates corresponding to each loci group
        attribute which defines homologues groups of proteomes.

    It's designed to use as input mmseqs_cluster() function results. However, if you have obtained homologues
        groups by other method you can also build pandas dataframe based on that with index corresponding to
        feature id and column "cluster" corresponding to the group.

    Arguments:
          dataframe (pd.DataFrame): dataframe with feature id - group pairs. Its index column should
            represent all loci CDS features.
            one_cluster (bool): consider all sequences to be members of one cluster, but still define the
            optimal order.

    Returns:
        None

    """
    try:
        proteins_loci_dict = collections.defaultdict(collections.deque)
        loci_clusters_dict = dict()
        number_of_loci = len(self.loci)
        if number_of_loci < 2:
            return None
        proteome_sizes = pd.Series(np.zeros(number_of_loci, dtype=int))
        for locus_index in range(number_of_loci):
            locus = self.loci[locus_index]
            loci_clusters = [dataframe.loc[feature.feature_id, "cluster"] for feature in locus.features if
                             feature.feature_type == "CDS"]
            loci_clusters_dict[locus_index] = list(set(loci_clusters))
            proteome_sizes.iloc[locus_index] = len(set(loci_clusters))
            for l_cl in loci_clusters:
                proteins_loci_dict[l_cl].append(locus_index)

        loci_ids = [locus.seq_id for locus in self.loci]
        similarity_matrix = pd.DataFrame(0.0, index=loci_ids, columns=loci_ids)
        for locus_index in range(number_of_loci):
            counts = pd.Series(np.zeros(number_of_loci, dtype=int))
            for cluster in loci_clusters_dict[locus_index]:
                js = proteins_loci_dict[cluster]
                counts.iloc[js] += 1
            locus_size = proteome_sizes[locus_index]
            norm_factors = pd.Series(0.5 * (locus_size + proteome_sizes) / (locus_size * proteome_sizes),
                                     index=counts.index)
            weights = counts.mul(norm_factors)
            similarity_matrix.iloc[locus_index] = weights
        symmetric_distance_matrix = 1 - similarity_matrix
        np.fill_diagonal(symmetric_distance_matrix.values, 0)
        linkage_matrix = scipy.cluster.hierarchy.linkage(
            scipy.spatial.distance.squareform(symmetric_distance_matrix),
            method="average")
        dendrogram = scipy.cluster.hierarchy.dendrogram(linkage_matrix, no_plot=True)
        if not one_cluster:
            clusters = pd.Series(scipy.cluster.hierarchy.fcluster(linkage_matrix,
                                                                  self.prms.args["clustering_h_value"],
                                                                  criterion="distance"),
                                 index=loci_ids)
            for locus in self.loci:
                locus.group = clusters[locus.seq_id]
                self.locus_annotation.loc[locus.seq_id, "group"] = locus.group
        order = dendrogram["leaves"][::-1]
        self.locus_annotation["initial_order"] = self.locus_annotation["order"]
        for locus_index in range(number_of_loci):
            locus = self.loci[locus_index]
            self.locus_annotation.loc[locus.seq_id, "order"] = order.index(locus_index)
        self.locus_annotation.sort_values(by="order", inplace=True)
        seq_id_to_order = self.locus_annotation["order"].to_dict()
        self.loci.sort(key=lambda locus: seq_id_to_order[locus.seq_id])

        reordered_similarity_matrix = similarity_matrix.reindex(index=self.locus_annotation.index,
                                                                columns=self.locus_annotation.index)
        if not os.path.exists(self.prms.args["output_dir"]):
            os.mkdir(self.prms.args["output_dir"])
        file_path = os.path.join(self.prms.args["output_dir"], "proteome_similarity_matrix.tsv")
        reordered_similarity_matrix.to_csv(file_path, sep="\t")
        num_of_loci_groups = len(set(self.locus_annotation["group"].to_list()))
        if self.prms.args["verbose"]:
            if num_of_loci_groups == 1:
                print(f"⦿ Loci order and {num_of_loci_groups} cluster was defined with proteome similarity based "
                      f"hierarchical clustering", file=sys.stdout)
            elif num_of_loci_groups > 1:
                print(f"⦿ Loci order and {num_of_loci_groups} clusters were defined with proteome similarity based "
                      f"hierarchical clustering", file=sys.stdout)
            print(f"⦿ Proteome similarity matrix of loci was saved to {file_path}", file=sys.stdout)
        return None
    except Exception as error:
        raise lovis4u.Manager.lovis4uError("Unable to cluster loci sequences.") from error

define_feature_groups(dataframe, group_column_name='cluster')

Set features attribute "group" based on input dataframe.

By default it is designed to use mmseqs_cluster() function results as input. If you already have precomputed feature groups you can set them with feature table.

Parameters:

  • dataframe (DataFrame) –

    dataframe with feature id - group pairs. Its index column should represent all loci CDS features.

  • group_column_name (str, default: 'cluster' ) –

    column name of the dataframe that represent corresponding group to each feature.

Returns:

  • None

    None

Source code in lovis4u/DataProcessing.py
def define_feature_groups(self, dataframe: pd.DataFrame, group_column_name: str = "cluster") -> None:
    """Set features attribute "group" based on input dataframe.

    By default it is designed to use mmseqs_cluster() function results as input. If you already have precomputed
        feature groups you can set them with feature table.

    Arguments:
        dataframe (pd.DataFrame): dataframe with feature id - group pairs. Its index column should
            represent all loci CDS features.
        group_column_name (str): column name of the dataframe that represent corresponding group to each feature.

    Returns:
        None

    """
    try:

        for locus in self.loci:
            for feature in locus.features:
                if feature.group and self.prms.args["keep_predefined_groups"]:
                    continue
                if feature.feature_type == "CDS":
                    feature.group = dataframe.loc[feature.feature_id, group_column_name]
                else:
                    feature.group = feature.name
                self.feature_annotation.loc[feature.feature_id, "group"] = feature.group
        return None
    except Exception as error:
        raise lovis4u.Manager.lovis4uError("Unable to define protein features groups.") from error

define_labels_to_be_shown()

Set feature visaulisation attribute "show_label" based on feature groups.

controlled by feature_labels_to_ignore, feature_group_types_to_show_label, and feature_group_types_to_show_label_on_first_occurrence parameters.

Returns:

  • None

Source code in lovis4u/DataProcessing.py
def define_labels_to_be_shown(self):
    """Set feature visaulisation attribute "show_label" based on feature groups.

    controlled by feature_labels_to_ignore, feature_group_types_to_show_label, and
        feature_group_types_to_show_label_on_first_occurrence parameters.

    Returns:
        None

    """
    try:
        added_first_occurrence_labels = []
        for locus in self.loci:
            for feature in locus.features:
                if self.prms.args["show_all_feature_labels"]:
                    feature.vis_prms["show_label"] = 1
                    continue
                if feature.vis_prms["label"] not in self.prms.args["feature_labels_to_ignore"]:
                    if "any" in self.prms.args["feature_group_types_to_show_label"]:
                        feature.vis_prms["show_label"] = 1
                    elif feature.group_type in self.prms.args["feature_group_types_to_show_label"] or \
                            (feature.vis_prms["hmmscan_hit"] and self.prms.args["show_all_label_for_query_"
                                                                                "proteins"]):
                        feature.vis_prms["show_label"] = 1
                    elif (feature.group_type in \
                          self.prms.args["feature_group_types_to_show_label_on_first_occurrence"]) or \
                            (feature.vis_prms["hmmscan_hit"] and self.prms.args["show_label_on_first_occurrence"
                                                                                "_for_query_proteins"]):
                        if feature.group not in added_first_occurrence_labels or len(self.loci) == 1:
                            feature.vis_prms["show_label"] = 1
                            added_first_occurrence_labels.append(feature.group)
                else:
                    feature.vis_prms["show_label"] = 0
    except Exception as error:
        raise lovis4u.Manager.lovis4uError("Unable define feature labels to be shown.") from error

find_variable_feature_groups(mmseqs_results)

Define feature group type attributes (variable or conserved) based on their conservation in corresponding loci group feature.

It's designed to use as input mmseqs_cluster() function results. However, if you have obtained homologues groups by other method you can also build pandas dataframe based on that with index corresponding to feature id and column "cluster" corresponding to the group.

Parameters:

  • mmseqs_results (DataFrame) –

    dataframe with feature id - group pairs. Its index column should represent all loci CDS features.

Returns:

  • None

    None

Source code in lovis4u/DataProcessing.py
def find_variable_feature_groups(self, mmseqs_results: pd.DataFrame) -> None:
    """Define feature group type attributes (variable or conserved) based on their conservation in corresponding
        loci group feature.

    It's designed to use as input mmseqs_cluster() function results. However, if you have obtained homologues
        groups by other method you can also build pandas dataframe based on that with index corresponding to
        feature id and column "cluster" corresponding to the group.

    Arguments:
          mmseqs_results (pd.DataFrame): dataframe with feature id - group pairs. Its index column should
            represent all loci CDS features.

    Returns:
        None

    """
    try:
        loci_clusters_sizes = self.locus_annotation["group"].value_counts()
        loci_clusters_cutoff_v = np.round(self.prms.args["CDS_is_variable_cutoff"] * loci_clusters_sizes).astype(
            int)
        loci_clusters_cutoff_c = np.round(self.prms.args["CDF_is_conserved_cutoff"] * loci_clusters_sizes).astype(
            int)
        loci_clusters_cutoff_v[loci_clusters_cutoff_v == 0] = 1
        cluster_types = collections.defaultdict(dict)
        for cluster in set(mmseqs_results["cluster"].to_list()):
            cluster_proteins = mmseqs_results[mmseqs_results["cluster"] == cluster].index
            cluster_loci = [locus for locus in self.loci if
                            any(feature.feature_id in cluster_proteins for feature in locus.features)]
            cluster_loci_groups = [locus.group for locus in cluster_loci]
            for cluster_locus_group in cluster_loci_groups:
                current_group_cluster_loci = [locus.seq_id for locus in cluster_loci if
                                              locus.group == cluster_locus_group]
                current_group_cluster_size = len(set(current_group_cluster_loci))
                if loci_clusters_sizes[cluster_locus_group] > 1 and \
                        current_group_cluster_size <= loci_clusters_cutoff_v[cluster_locus_group]:
                    cluster_types[cluster_locus_group][cluster] = "variable"
                elif loci_clusters_sizes[cluster_locus_group] > 1 and \
                        (loci_clusters_cutoff_v[cluster_locus_group] < current_group_cluster_size <
                         loci_clusters_cutoff_c[cluster_locus_group]):
                    cluster_types[cluster_locus_group][cluster] = "intermediate"
                else:
                    cluster_types[cluster_locus_group][cluster] = "conserved"
        for locus in self.loci:
            locus_group = locus.group
            for feature in locus.features:
                if feature.feature_type == "CDS":
                    if feature.group_type and self.prms.args["keep_predefined_groups"]:
                        continue
                    feature.group_type = cluster_types[locus_group][feature.group]
                else:
                    feature.group_type = "noncoding"
                self.feature_annotation.loc[feature.feature_id, "group_type"] = feature.group_type
        return None
    except Exception as error:
        raise lovis4u.Manager.lovis4uError("Unable to define variable feature groups.") from error

get_contig_sizes()

Helper method to get a dictionary with contig sizes for bedgraph file parsing

Returns:

  • dict

    dictionary with contig sizes

Source code in lovis4u/DataProcessing.py
def get_contig_sizes(self):
    """Helper method to get a dictionary with contig sizes for bedgraph file parsing

    Returns:
        dict: dictionary with contig sizes
    """
    try:
        contig_sizes = dict()
        for index, row in self.locus_annotation.iterrows():
            contig_sizes[index] = row["length"]
        return contig_sizes
    except Exception as error:
        raise lovis4u.Manager.lovis4uError("Unable to get contig sizes dict.") from error

get_loci_lengths_and_n_of_regions()

Get loci lengths and number of regions.

Returns:

  • list ( list[list[int]] ) –

    list each element of each contains locus size and number of breaks for visualisation track.

Source code in lovis4u/DataProcessing.py
def get_loci_lengths_and_n_of_regions(self) -> list[list[int]]:
    """Get loci lengths and number of regions.

    Returns:
        list: list each element of each contains locus size and number of breaks for visualisation track.

    """
    try:
        loci_sizes = []
        for locus in self.loci:
            number_of_gaps = len(locus.coordinates) - 1
            if locus.circular:
                for i in range(number_of_gaps):
                    if locus.coordinates[i]["end"] == locus.length and locus.coordinates[i + 1]["start"] == 1:
                        number_of_gaps -= 1
            loci_sizes.append([locus.size, number_of_gaps])
        return loci_sizes
    except Exception as error:
        raise lovis4u.Manager.lovis4uError("Unable to get loci lengths.") from error

load_feature_annotation_file(file_path)

Load features annotation file.

Parameters:

  • file_path (str) –

    File path for a features annotation file to be loaded.

Returns:

  • None

    None

Source code in lovis4u/DataProcessing.py
def load_feature_annotation_file(self, file_path: str) -> None:
    """Load features annotation file.

    Arguments:
        file_path (str): File path for a features annotation file to be loaded.

    Returns:
        None

    """
    annotation_columns = ["feature_id", "locus_id", "coordinates", "feature_type", "name", "group", "group_type",
                          "category", "fill_colour", "stroke_colour", "show_label"]
    self.feature_annotation = self.__load_annotation_file(file_path, annotation_columns, "feature_id")
    return None

load_loci_from_extended_gff(input_f, ilund4u_mode=False)

Load loci from the folder with gff files. Each GFF file also should contain corresponding nucleotide sequence. Such files are produced for example by pharokka annotation tool.

All files with extension other than .gff (not case-sensitive) will be ignored.

Parameters:

  • input_folder

    folder name with gff files.

Returns:

  • None

    None

Source code in lovis4u/DataProcessing.py
def load_loci_from_extended_gff(self, input_f: str, ilund4u_mode: bool = False) -> None:
    """Load loci from the folder with gff files. Each GFF file also should contain corresponding nucleotide
        sequence. Such files are produced for example by pharokka annotation tool.

    All files with extension other than .gff (not case-sensitive) will be ignored.

    Arguments:
        input_folder: folder name with gff files.

    Returns:
        None

    """

    try:
        if isinstance(input_f, str):
            input_folder = input_f
            if not os.path.exists(input_folder):
                raise lovis4u.Manager.lovis4uError(f"Folder/file {input_folder} does not exist.")
            if os.path.isfile(input_f):
                gff_files = [input_f]
            else:
                gff_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder)]
        elif isinstance(input_f, list):
            gff_files = input_f
        else:
            raise lovis4u.Manager.lovis4uError(f"The input for the GFF parsing function must be either a folder or "
                                               f"a list of files.")
        if not gff_files:
            raise lovis4u.Manager.lovis4uError(f"Folder {input_f} does not contain files.")
        if self.prms.args["verbose"]:
            print(f"○ Reading gff file{'s' if len(gff_files) > 1 else ''}...", file=sys.stdout)
        for gff_file in gff_files:
            try:
                gff_records = list(BCBio.GFF.parse(gff_file, limit_info=dict(gff_type=["CDS", "tRNA", "tmRNA",
                                                                                       "RNA",
                                                                                       "pseudogene", "rRNA",
                                                                                       "misc_RNA",
                                                                                       "ncRNA", "lncRNA"])))
                if len(gff_records) != 1:
                    print(f"○ Warning: gff file {gff_file} contains information for more than 1 "
                          f"sequence. File will be skipped.")
                    continue
                gff_record = gff_records[0]
                try:
                    record_locus_sequence = gff_record.seq
                except Bio.Seq.UndefinedSequenceError:
                    print(f"○ Warning: gff file {gff_file} doesn't contain corresponding sequences.")
                    continue
                if self.prms.args["gff_description_source"] in gff_record.annotations:
                    record_description = gff_record.annotations[self.prms.args["gff_description_source"]][0]
                    if isinstance(record_description, tuple):
                        record_description = " ".join(record_description)
                else:
                    record_description = ""
                if self.prms.args["use_filename_as_contig_id"]:
                    gff_record.id = os.path.splitext(os.path.basename(gff_file))[0]
                self.__update_locus_annotation(gff_record.id, record_description, len(record_locus_sequence))
                locus_annotation_row = self.locus_annotation.loc[gff_record.id]
                coordinates = [dict(zip(["start", "end", "strand"], map(int, c.split(":")))) for c in
                               locus_annotation_row["coordinates"].split(",")]
                record_locus = Locus(seq_id=gff_record.id, coordinates=coordinates,
                                     description=locus_annotation_row["description"],
                                     sequence=Bio.SeqRecord.SeqRecord(seq=record_locus_sequence, id=gff_record.id),
                                     circular=locus_annotation_row["circular"],
                                     length=locus_annotation_row["length"], parameters=self.prms, features=[],
                                     order=locus_annotation_row["order"])
                features_ids = [i.id for i in gff_record.features]
                if len(features_ids) != len(set(features_ids)):
                    raise lovis4u.Manager.lovis4uError(f"Gff file {gff_file} contains duplicated feature ids while"
                                                       f" only unique are allowed.")
                for gff_feature in gff_record.features:
                    feature_id = gff_feature.id
                    if ilund4u_mode or self.prms.args["add_locus_id_prefix"]:
                        if gff_record.id not in feature_id:
                            feature_id = f"{gff_record.id}-{feature_id}"
                    if gff_feature.type == "CDS":
                        transl_table = self.prms.args["default_transl_table"]
                        if "transl_table" in gff_feature.qualifiers.keys():
                            transl_table = int(gff_feature.qualifiers["transl_table"][0])
                        name = ""
                        if self.prms.args["gff_CDS_name_source"] in gff_feature.qualifiers:
                            name = gff_feature.qualifiers[self.prms.args["gff_CDS_name_source"]][0]
                        category = ""
                        if self.prms.args["gff_CDS_category_source"] in gff_feature.qualifiers:
                            category = ",".join(gff_feature.qualifiers[self.prms.args["gff_CDS_category_source"]])
                        sequence = gff_feature.translate(record_locus_sequence, table=transl_table,
                                                         cds=False)[:-1]
                    else:
                        name, category = "", ""
                        if self.prms.args["gff_noncoding_name_source"] in gff_feature.qualifiers:
                            name = self.prms.args["gff_noncoding_name_source"][0]
                        else:
                            for ans in self.prms.args["gff_noncoding_name_alternative_source"]:
                                if ans in gff_feature.qualifiers:
                                    name = gff_feature.qualifiers[ans][0]
                                    break
                        if not name:
                            name = feature_id
                        category = gff_feature.type
                        sequence = gff_feature.extract(record_locus_sequence)
                    for coordinate in record_locus.coordinates:
                        overlapping = False
                        start, end = coordinate["start"], coordinate["end"]
                        if start <= gff_feature.location.start + 1 <= end or start <= gff_feature.location.end <= end:
                            overlapping = True
                            break
                    if not overlapping and not self.prms.args["cluster_all_proteins"]:
                        continue
                    self.__update_feature_annotation(feature_id, record_locus.seq_id,
                                                     f"{int(gff_feature.location.start) + 1}:"
                                                     f"{int(gff_feature.location.end)}:{gff_feature.location.strand}",
                                                     gff_feature.type, category, name)
                    feature_annotation_row = self.feature_annotation.loc[feature_id]
                    feature = Feature(feature_type=feature_annotation_row["feature_type"],
                                      feature_id=feature_id, start=int(gff_feature.location.start) + 1,
                                      end=int(gff_feature.location.end), strand=gff_feature.location.strand,
                                      name=feature_annotation_row["name"],
                                      sequence=sequence,
                                      group=feature_annotation_row["group"],
                                      group_type=feature_annotation_row["group_type"],
                                      category=feature_annotation_row["category"],
                                      vis_prms=dict(fill_colour=feature_annotation_row["fill_colour"],
                                                    stroke_colour=feature_annotation_row["stroke_colour"],
                                                    show_label=feature_annotation_row["show_label"],
                                                    hmmscan_hit=0),
                                      overlapping=overlapping, parameters=self.prms)
                    record_locus.features.append(feature)
                self.loci.append(record_locus)
            except:
                print(f"○ Warning: gff file {gff_file} was not read properly and skipped")
                if self.prms.args["parsing_debug"]:
                    self.prms.args["debug"] = True
                    raise lovis4u.Manager.lovis4uError()
        seq_id_to_order = self.locus_annotation["order"].to_dict()
        loci_ids = [l.seq_id for l in self.loci]
        if len(loci_ids) != len(set(loci_ids)):
            raise lovis4u.Manager.lovis4uError(f"The input gff files have duplicated contig ids. "
                                               f"You can use `--use-filename-as-id` parameter to use file name "
                                               f"as contig id which can help to fix the problem.")
        feature_ids = [feature.feature_id for locus in self.loci for feature in locus.features]
        if len(feature_ids) != len(set(feature_ids)):
            raise lovis4u.Manager.lovis4uError(f"The input gff files have duplicated features ids. "
                                               f"You can use `--add-locus-id-prefix` (`-alip`) parameter to add "
                                               f"contig id prefix to each feature id which can help to "
                                               f"fix the problem.")
        self.loci.sort(key=lambda locus: seq_id_to_order[locus.seq_id])
        if self.prms.args["verbose"]:
            print(f"⦿ {len(self.loci)} {'locus was' if len(self.loci) == 1 else 'loci were'} loaded from the gff "
                  f"files folder", file=sys.stdout)
        return None
    except Exception as error:
        raise lovis4u.Manager.lovis4uError("Unable to load loci from gff folder.") from error

load_loci_from_gb(input_folder)

Load loci from the folder with genbank files.

All files with extension other than .gb (not case-sensitive) will be ignored.

Parameters:

  • input_folder (str) –

    folder name with gb files.

Returns:

  • None

    None

Source code in lovis4u/DataProcessing.py
def load_loci_from_gb(self, input_folder: str) -> None:
    """Load loci from the folder with genbank files.

    All files with extension other than .gb (not case-sensitive) will be ignored.

    Arguments:
        input_folder: folder name with gb files.

    Returns:
        None

    """
    if not os.path.exists(input_folder):
        raise lovis4u.Manager.lovis4uError(f"Folder/file {input_folder} does not exist.")
    try:
        if os.path.isfile(input_folder):
            gb_files = [input_folder]
        else:
            gb_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder)]
        if not gb_files:
            raise lovis4u.Manager.lovis4uError(f"Folder {input_folder} does not contain files.")
        if self.prms.args["verbose"]:
            print(f"○ Reading gb file{'s' if len(gb_files) > 1 else ''}...", file=sys.stdout)
        for gb_file in gb_files:
            try:
                gb_records = list(Bio.SeqIO.parse(gb_file, "genbank"))
                if len(gb_records) != 1:
                    print(f"○ Warning: gb file {gb_file} contains information for more than 1 "
                          f"sequence. File will be skipped.")
                    continue
                gb_record = gb_records[0]
                record_locus_sequence = gb_record.seq
                if self.prms.args["genbank_description_source"] == "description":
                    record_description = gb_record.description
                elif "annotations:" in self.prms.args["genbank_description_source"]:
                    feature_description_key = self.prms.args["genbank_description_source"].split(":")[1]
                    record_description = gb_record.annotations[feature_description_key]
                else:
                    record_description = ""
                if self.prms.args["use_filename_as_contig_id"]:
                    gb_record.id = os.path.splitext(os.path.basename(gb_file))[0]
                self.__update_locus_annotation(gb_record.id, record_description, len(record_locus_sequence))
                locus_annotation_row = self.locus_annotation.loc[gb_record.id]
                coordinates = [dict(zip(["start", "end", "strand"], map(int, c.split(":")))) for c in
                               locus_annotation_row["coordinates"].split(",")]
                record_locus = Locus(seq_id=gb_record.id, coordinates=coordinates,
                                     description=locus_annotation_row["description"],
                                     sequence=Bio.SeqRecord.SeqRecord(seq=record_locus_sequence, id=gb_record.id),
                                     circular=locus_annotation_row["circular"],
                                     length=locus_annotation_row["length"], parameters=self.prms, features=[],
                                     order=locus_annotation_row["order"])

                gb_CDSs = [i for i in gb_record.features if i.type == "CDS"]
                first_CDS_record = gb_CDSs[0]
                id_source = self.prms.args["genbank_id_source"]
                if self.prms.args["genbank_id_source"] not in first_CDS_record.qualifiers:
                    for alternative_id_source in self.prms.args["genbank_id_alternative_source"]:
                        if alternative_id_source in first_CDS_record.qualifiers:
                            id_source = alternative_id_source
                            if self.prms.args["verbose"]:
                                print(f"○ Warning: there is no <{self.prms.args['genbank_id_source']}> attribute "
                                      f"for CDS records in {gb_file}. Alternative <{id_source}> was used instead.",
                                      file=sys.stdout)
                            break
                    if id_source == self.prms.args["genbank_id_source"]:
                        print(f"There is no <{self.prms.args['genbank_id_source']}> "
                              f"attribute for CDS record found in {gb_file}. We tried to"
                              f" find any from the alternative list: "
                              f"{','.join(self.prms.args['genbank_id_alternative_source'])}"
                              f", but they also weren't found.")  # add about cmd parameter
                features_ids = [i.qualifiers[id_source][0] for i in gb_CDSs if id_source in i.qualifiers]
                if len(features_ids) != len(set(features_ids)):
                    print(f"GB file {gb_record} contains duplicated feature ids while"
                          f" only unique are allowed.")
                for gb_feature in gb_record.features:
                    if id_source not in gb_feature.qualifiers:
                        print(f"    ○ Warning: genbank feature for {gb_file} located at {gb_feature.location} "
                              f"was skipped\n    since it does not have id qualifier {id_source} found for other "
                              f"features.\n    it could be a case of zero length ORF. ", file=sys.stdout)
                        continue
                    feature_id = gb_feature.qualifiers[id_source][0].replace("|", "_")
                    if self.prms.args["add_locus_id_prefix"]:
                        if gb_record.id not in feature_id:
                            feature_id = f"{gb_record.id}-{feature_id}"
                    transl_table = self.prms.args["default_transl_table"]
                    if "transl_table" in gb_feature.qualifiers.keys():
                        transl_table = int(gb_feature.qualifiers["transl_table"][0])
                    name = ""
                    if gb_feature.type == "CDS":
                        if self.prms.args["genbank_CDS_name_source"] in gb_feature.qualifiers:
                            name = gb_feature.qualifiers[self.prms.args["genbank_CDS_name_source"]][0]
                        category = ""
                        if self.prms.args["genbank_CDS_category_source"] in gb_feature.qualifiers:
                            category = ",".join(
                                gb_feature.qualifiers[self.prms.args["genbank_CDS_category_source"]])
                        sequence = gb_feature.translate(record_locus_sequence, table=transl_table, cds=False)[:-1]
                    else:
                        name, category = "", ""
                        if self.prms.args["gff_noncoding_name_source"] in gb_feature.qualifiers:
                            name = self.prms.args["gff_noncoding_name_source"][0]
                        else:
                            for ans in self.prms.args["gff_noncoding_name_alternative_source"]:
                                if ans in gb_feature.qualifiers:
                                    name = gb_feature.qualifiers[ans][0]
                                    break
                        if not name:
                            name = feature_id
                        category = gb_feature.type
                        sequence = gb_feature.extract(record_locus_sequence)
                    for coordinate in record_locus.coordinates:
                        overlapping = False
                        start, end = coordinate["start"], coordinate["end"]
                        if start <= gb_feature.location.start + 1 <= end or start <= gb_feature.location.end <= end:
                            overlapping = True
                            break
                    if not overlapping and not self.prms.args["cluster_all_proteins"]:
                        continue
                    self.__update_feature_annotation(feature_id, record_locus.seq_id,
                                                     f"{int(gb_feature.location.start) + 1}:"
                                                     f"{int(gb_feature.location.end)}:"
                                                     f"{gb_feature.location.strand}", gb_feature.type, category,
                                                     name)
                    feature_annotation_row = self.feature_annotation.loc[feature_id]
                    feature = Feature(feature_type=feature_annotation_row["feature_type"],
                                      feature_id=feature_id, start=int(gb_feature.location.start) + 1,
                                      end=int(gb_feature.location.end),
                                      strand=gb_feature.location.strand,
                                      name=feature_annotation_row["name"],
                                      sequence=sequence,
                                      group=feature_annotation_row["group"],
                                      group_type=feature_annotation_row["group_type"],
                                      category=feature_annotation_row["category"],
                                      vis_prms=dict(fill_colour=feature_annotation_row["fill_colour"],
                                                    stroke_colour=feature_annotation_row["stroke_colour"],
                                                    show_label=feature_annotation_row["show_label"],
                                                    hmmscan_hit=0),
                                      overlapping=overlapping,
                                      parameters=self.prms)

                    record_locus.features.append(feature)
                self.loci.append(record_locus)
            except:
                print(f"○ Warning: gb file {gb_file} was not read properly and skipped")
                if self.prms.args["parsing_debug"]:
                    self.prms.args["debug"] = True
                    raise lovis4u.Manager.lovis4uError()
        seq_id_to_order = self.locus_annotation["order"].to_dict()
        loci_ids = [l.seq_id for l in self.loci]
        if len(loci_ids) != len(set(loci_ids)):
            raise lovis4u.Manager.lovis4uError(f"The input gb files have duplicated contig ids. "
                                               f"You can use `--use-filename-as-id` parameter to use file name "
                                               f"as contig id which can help to fix the problem.")
        feature_ids = [feature.feature_id for locus in self.loci for feature in locus.features]
        if len(feature_ids) != len(set(feature_ids)):
            raise lovis4u.Manager.lovis4uError(f"The input gb files have duplicated features ids. "
                                               f"You can use `--add-locus-id-prefix` (`-alip`) parameter to add "
                                               f"contig id prefix to each feature id which can help to "
                                               f"fix the problem.")
        self.loci.sort(key=lambda locus: seq_id_to_order[locus.seq_id])
        if self.prms.args["verbose"]:
            print(f"⦿ {len(self.loci)} {'locus was' if len(self.loci) == 1 else 'loci were'} loaded from the "
                  f"genbank files folder", file=sys.stdout)
        return None
    except Exception as error:
        raise lovis4u.Manager.lovis4uError("Unable to load loci from gb folder.") from error

load_locus_annotation_file(file_path)

Load loci annotation file.

Parameters:

  • file_path (str) –

    File path for a loci annotation file to be loaded.

Returns:

  • None

    None

Source code in lovis4u/DataProcessing.py
def load_locus_annotation_file(self, file_path: str) -> None:
    """Load loci annotation file.

    Arguments:
        file_path (str): File path for a loci annotation file to be loaded.

    Returns:
        None

    """
    annotation_columns = ["sequence_id", "length", "coordinates", "circular", "description", "order", "group"]
    self.locus_annotation = self.__load_annotation_file(file_path, annotation_columns, "sequence_id")
    return None

mmseqs_cluster()

Cluster all proteins using mmseqs in order to define groups of homologues.

Returns:

  • DataFrame

    pd.DataFrame: parsed mmseqs table (pandas dataframe) with columns: cluster, protein_id; where cluster is defined by representative sequence id within a corresponding cluster.

Source code in lovis4u/DataProcessing.py
def mmseqs_cluster(self) -> pd.DataFrame:
    """Cluster all proteins using mmseqs in order to define groups of homologues.

    Returns:
        pd.DataFrame: parsed mmseqs table (pandas dataframe) with columns: cluster, protein_id; where cluster is
            defined by representative sequence id within a corresponding cluster.

    """
    if self.prms.args["verbose"]:
        print(f"○ Running mmseqs for protein clustering...", file=sys.stdout)
    try:
        feature_records = [feature.record for locus in self.loci for feature in locus.features if
                           feature.feature_type == "CDS"]
        temp_input = tempfile.NamedTemporaryFile()
        Bio.SeqIO.write(feature_records, temp_input.name, "fasta")
        if not os.path.exists(self.prms.args["output_dir"]):
            os.mkdir(self.prms.args["output_dir"])
        mmseqs_output_folder = os.path.join(self.prms.args["output_dir"], "mmseqs")
        if os.path.exists(mmseqs_output_folder):
            shutil.rmtree(mmseqs_output_folder)
        os.mkdir(mmseqs_output_folder)
        Bio.SeqIO.write(feature_records, os.path.join(mmseqs_output_folder, "input_proteins.fa"), "fasta")
        mmseqs_output_folder_db = os.path.join(mmseqs_output_folder, "DB")
        os.mkdir(mmseqs_output_folder_db)
        mmseqs_stdout = open(os.path.join(mmseqs_output_folder, "mmseqs_stdout.txt"), "w")
        mmseqs_stderr = open(os.path.join(mmseqs_output_folder, "mmseqs_stderr.txt"), "w")
        subprocess.run([self.prms.args["mmseqs_binary"], "createdb", temp_input.name,
                        os.path.join(mmseqs_output_folder_db, "sequencesDB")], stdout=mmseqs_stdout,
                       stderr=mmseqs_stderr)
        subprocess.run([self.prms.args["mmseqs_binary"], "cluster",
                        os.path.join(mmseqs_output_folder_db, "sequencesDB"),
                        os.path.join(mmseqs_output_folder_db, "clusterDB"),
                        os.path.join(mmseqs_output_folder_db, "tmp"),
                        "--cluster-mode", str(self.prms.args["mmseqs_cluster_mode"]),
                        "--cov-mode", str(self.prms.args["mmseqs_cov_mode"]),
                        "--min-seq-id", str(self.prms.args["mmseqs_min_seq_id"]),
                        "-c", str(self.prms.args["mmseqs_c"]),
                        "-s", str(self.prms.args["mmseqs_s"])], stdout=mmseqs_stdout, stderr=mmseqs_stderr)
        subprocess.run([self.prms.args["mmseqs_binary"], "createtsv",
                        os.path.join(mmseqs_output_folder_db, "sequencesDB"),
                        os.path.join(mmseqs_output_folder_db, "sequencesDB"),
                        os.path.join(mmseqs_output_folder_db, "clusterDB"),
                        os.path.join(mmseqs_output_folder, "mmseqs_clustering.tsv")],
                       stdout=mmseqs_stdout, stderr=mmseqs_stderr)
        mmseqs_clustering_results = pd.read_table(os.path.join(mmseqs_output_folder, "mmseqs_clustering.tsv"),
                                                  sep="\t", header=None, names=["cluster", "protein_id"])
        mmseqs_clustering_results = mmseqs_clustering_results.set_index("protein_id")

        num_of_unique_clusters = len(set(mmseqs_clustering_results["cluster"].to_list()))
        num_of_proteins = len(mmseqs_clustering_results.index)
        if self.prms.args["verbose"]:
            print(f"⦿ {num_of_unique_clusters} clusters for {num_of_proteins} proteins were found with mmseqs\n"
                  f"\tmmseqs clustering results were saved to "
                  f"{os.path.join(mmseqs_output_folder, 'mmseqs_clustering.tsv')}", file=sys.stdout)
        return mmseqs_clustering_results
    except Exception as error:
        raise lovis4u.Manager.lovis4uError("Unable to run mmseqs clustering. In case you use a linux machine"
                                           ",  have you run a post-install 'lovis4u --linux` command to switch to"
                                           " the linux mmseqs binary?") from error

pyhmmer_annotation()

Run pyhhmmer hmmscan against a set of databases for additional annotation of hotspot proteins.

Parameters:

  • proteomes (Proteomes) –

    Proteomes object.

Returns:

  • None

    None

Source code in lovis4u/DataProcessing.py
def pyhmmer_annotation(self) -> None:
    """Run pyhhmmer hmmscan against a set of databases for additional annotation of hotspot proteins.

    Arguments:
        proteomes (Proteomes): Proteomes object.

    Returns:
        None

    """
    try:
        if self.prms.args["verbose"]:
            print(f"○ Preparing data for additional protein annotation with pyhmmer hmmscan...",
                  file=sys.stdout)

        cds_records = [feature.record for locus in self.loci for feature in locus.features if
                       feature.feature_type == "CDS"]
        hmmscan_output_folder = os.path.join(self.prms.args["output_dir"], "hmmscan")
        if os.path.exists(hmmscan_output_folder):
            shutil.rmtree(hmmscan_output_folder)
        os.mkdir(hmmscan_output_folder)
        hmmscan_input_fasta_file_path = os.path.join(hmmscan_output_folder, "input_proteins.fa")
        Bio.SeqIO.write(cds_records, hmmscan_input_fasta_file_path, "fasta")

        alignment_table = lovis4u.Methods.run_pyhmmer(hmmscan_input_fasta_file_path, len(cds_records), self.prms)
        if not alignment_table.empty:
            found_hits_for = alignment_table.index.to_list()
            for locus in self.loci:
                for feature in locus.features:
                    if feature.feature_id in found_hits_for:
                        alignment_table_row = alignment_table.loc[feature.feature_id]
                        if self.prms.args["update_category_with_database_name"]:
                            feature.category = alignment_table_row["target_db"]
                        if self.prms.args["update_protein_name_with_target_name"]:
                            feature.name = alignment_table_row["target"]
                            feature.vis_prms["label"] = feature.name
                        feature.vis_prms["hmmscan_hit"] = 1
        return None
    except Exception as error:
        raise lovis4u.Manager.lovis4uError("Unable to run pyhmmer hmmscan annotation.") from error

remove_non_overlapping_features()

Removes features that are not overlapping with visualisation window.

Returns:

  • None

    None

Source code in lovis4u/DataProcessing.py
def remove_non_overlapping_features(self) -> None:
    """Removes features that are not overlapping with visualisation window.

    Returns:
        None
    """
    try:
        ids_of_non_overlapping_objects = []
        for locus in self.loci:
            ids_of_non_overlapping_objects += [obj.feature_id for obj in locus.features if not obj.overlapping]
            filtered_objects = [obj for obj in locus.features if obj.overlapping]
            locus.features = filtered_objects
        if ids_of_non_overlapping_objects:
            print("○ Warning message: LoVis4u clusters all proteins by default to define their classes"
                  "\n\t('variable' or 'conserved'), including those outside the visualisation window."
                  "\n\tTo cluster only proteins within the visualised area, use the -cl-owp parameter.")
        self.feature_annotation = self.feature_annotation.drop(ids_of_non_overlapping_objects)
    except Exception as error:
        raise lovis4u.Manager.lovis4uError("Unable to clean non overlapping features.") from error

reorient_loci(ilund4u_mode=False)

Auto re-orient loci (reset strands) of loci if they are not matched.

Function tries to maximise co-orientation of homologous features.

Returns:

  • None

    None

Source code in lovis4u/DataProcessing.py
def reorient_loci(self, ilund4u_mode: bool = False) -> None:
    """Auto re-orient loci (reset strands) of loci if they are not matched.

    Function tries to maximise co-orientation of homologous features.

    Returns:
        None

    """
    try:
        count_of_changed_strands = 0
        loci = [locus for locus in self.loci]
        for locus_index in range(1, len(loci)):
            p_locus = loci[locus_index - 1]
            c_locus = loci[locus_index]
            p_locus_strands = list(set([c["strand"] for c in p_locus.coordinates]))
            c_locus_strands = list(set([c["strand"] for c in c_locus.coordinates]))
            if len(p_locus_strands) == 1 and len(c_locus_strands) == 1:
                if not ilund4u_mode:
                    pr_locus_features_groups = set([f.group for f in p_locus.features])
                    c_locus_features_groups = set([f.group for f in c_locus.features])
                else:
                    pr_locus_features_groups = set(
                        [f.group for f in p_locus.features if f.group_type == "conserved"])
                    c_locus_features_groups = set(
                        [f.group for f in c_locus.features if f.group_type == "conserved"])
                overlapped_f_groups = pr_locus_features_groups & c_locus_features_groups
                prl_strand, cl_strand = p_locus_strands[0], c_locus_strands[0]
                pr_locus_features_strands = {f.group: f.strand * prl_strand for f in p_locus.features if
                                             f.group in overlapped_f_groups}
                c_locus_features_strands = {f.group: f.strand * cl_strand for f in c_locus.features if
                                            f.group in overlapped_f_groups}
                codirection_score = 0

                for ovg in overlapped_f_groups:
                    codirection_score += pr_locus_features_strands[ovg] * c_locus_features_strands[ovg]
                if codirection_score < 0:
                    count_of_changed_strands += 1
                    annot_coordinates = []
                    for cc in loci[locus_index].coordinates:
                        cc["strand"] *= -1
                        annot_coordinates.append(f"{cc['start']}:{cc['end']}:{cc['strand']}")
                    loci[locus_index].coordinates = loci[locus_index].coordinates[::-1]
                    annot_coordinates = annot_coordinates[::-1]
                    self.locus_annotation.loc[c_locus.seq_id, "coordinates"] = ",".join(annot_coordinates)
            else:
                if self.prms.args["verbose"]:
                    print("○ Warning: loci reorientation cannot be applied for loci that have both strands in"
                          " pre-defined coordinates for visualisation")
        if self.prms.args["verbose"]:
            if count_of_changed_strands == 0:
                print(f"⦿ Orientation was not changed for any locus", file=sys.stdout)
            elif count_of_changed_strands == 1:
                print(f"⦿ Orientation was changed for 1 locus", file=sys.stdout)
            elif count_of_changed_strands > 1:
                print(f"⦿ Orientation was changed for {count_of_changed_strands} loci", file=sys.stdout)

        return None
    except Exception as error:
        raise lovis4u.Manager.lovis4uError("Unable to define variable feature groups.") from error

save_feature_annotation_table()

Save feature annotation table to the output folder.

Output file name is feature_annotation_table.tsv

Returns:

  • None

    None

Source code in lovis4u/DataProcessing.py
def save_feature_annotation_table(self) -> None:
    """Save feature annotation table to the output folder.

    Output file name is feature_annotation_table.tsv

    Returns:
        None

    """
    try:
        if not os.path.exists(self.prms.args["output_dir"]):
            os.mkdir(self.prms.args["output_dir"])
        feature_ids = [feature.feature_id for locus in self.loci for feature in locus.features]
        self.feature_annotation = self.feature_annotation.loc[self.feature_annotation.index.isin(feature_ids)]
        file_path = os.path.join(self.prms.args["output_dir"], "feature_annotation_table.tsv")
        self.feature_annotation.to_csv(file_path, sep="\t", index_label="feature_id")
        if self.prms.args["verbose"]:
            print(f"⦿ Feature annotation table was saved to {file_path}", file=sys.stdout)
        return None
    except Exception as error:
        raise lovis4u.Manager.lovis4uError("Unable to save feature annotation table.") from error

save_locus_annotation_table()

Save loci annotation table to the output folder.

Output file name is locus_annotation_table.tsv

Returns:

  • None

    None

Source code in lovis4u/DataProcessing.py
def save_locus_annotation_table(self) -> None:
    """Save loci annotation table to the output folder.

    Output file name is locus_annotation_table.tsv

    Returns:
        None

    """
    try:
        if not os.path.exists(self.prms.args["output_dir"]):
            os.mkdir(self.prms.args["output_dir"])
        locus_ids = [locus.seq_id for locus in self.loci]
        self.locus_annotation = self.locus_annotation.loc[self.locus_annotation.index.isin(locus_ids)]
        file_path = os.path.join(self.prms.args["output_dir"], "locus_annotation_table.tsv")
        self.locus_annotation.to_csv(file_path, sep="\t", index_label="sequence_id")
        if self.prms.args["verbose"]:
            print(f"⦿ Loci annotation table was saved to {file_path}", file=sys.stdout)
        return None
    except Exception as error:
        raise lovis4u.Manager.lovis4uError("Unable to save loci annotation table.") from error

set_category_colours(use_table=True)

Define colours for each category.

Parameters:

  • use_table (bool, default: True ) –

    Bool value whether table with predefined colours should be used or not.

Returns:

  • None

    None

Source code in lovis4u/DataProcessing.py
def set_category_colours(self, use_table: bool = True) -> None:
    """Define colours for each category.

    Arguments:
        use_table (bool): Bool value whether table with predefined colours should be used or not.

    Returns:
        None

    """
    try:
        colours_dict = dict()
        if use_table:
            colours_dict.update(
                pd.read_table(self.prms.args["category_colours"]).set_index("category")["colour"].to_dict())

        feature_categories = list(set([feature.category for locus in self.loci for feature in locus.features
                                       if feature.category and feature.category]))
        if not feature_categories:
            if self.prms.args["verbose"]:
                print("○ Warning: there are no feature categories to set colours", file=sys.stdout)
        colours_dict = {cat: col for cat, col in colours_dict.items() if cat in feature_categories}

        feature_categories = [ff for ff in feature_categories if ff not in colours_dict.keys()]
        number_of_unique_feature_functions = len(feature_categories)
        colours_rgb = seaborn.color_palette(self.prms.args["category_colour_seaborn_palette"],
                                            number_of_unique_feature_functions,
                                            desat=self.prms.args["category_colour_seaborn_desat"])
        random.shuffle(colours_rgb)
        colours = list(map(lambda x: matplotlib.colors.rgb2hex(x), colours_rgb))
        colours_dict.update({g: c for g, c in zip(list(feature_categories), colours)})
        for locus in self.loci:
            locus.category_colours = colours_dict
        return None
    except Exception as error:
        raise lovis4u.Manager.lovis4uError("Unable to set category colours.") from error

set_feature_colours_based_on_groups()

Define features fill colour based on corresponding feature group and group types.

Returns:

  • None

    None

Source code in lovis4u/DataProcessing.py
def set_feature_colours_based_on_groups(self) -> None:
    """Define features fill colour based on corresponding feature group and group types.

    Returns:
        None

    """
    try:
        feature_groups = set([feature.group for locus in self.loci for feature in locus.features
                              if (feature.group and feature.feature_type == "CDS")])
        if self.prms.args["feature_group_types_to_set_colour"] and \
                "all" not in self.prms.args["feature_group_types_to_set_colour"]:
            feature_groups = set([feature.group for locus in self.loci for feature in locus.features
                                  if feature.group and feature.feature_type == "CDS" and feature.group_type in
                                  self.prms.args["feature_group_types_to_set_colour"]])
        number_of_unique_feature_groups = len(feature_groups)
        if self.prms.args["groups_fill_colour_palette_lib"] == "seaborn":
            colours_rgb = seaborn.color_palette(self.prms.args["groups_fill_colour_seaborn_palette"],
                                                number_of_unique_feature_groups,
                                                desat=self.prms.args["groups_fill_colour_seaborn_desat"])
            random.shuffle(colours_rgb)
        elif self.prms.args["groups_fill_colour_palette_lib"] == "distinctipy":
            colours_rgb = distinctipy.get_colors(number_of_unique_feature_groups,
                                                 exclude_colours=[(1, 1, 1), (0, 0, 0)],
                                                 pastel_factor=self.prms.args["groups_fill_colours_pastel_factor"])
        colours = list(map(lambda x: matplotlib.colors.rgb2hex(x), colours_rgb))
        colours_dict = {g: c for g, c in zip(list(feature_groups), colours)}
        for locus in self.loci:
            for feature in locus.features:
                if feature.group in feature_groups and feature.feature_type == "CDS":
                    if self.prms.args["keep_predefined_colours"] and feature.vis_prms["fill_colour"] != "default":
                        continue
                    feature.vis_prms["fill_colour"] = colours_dict[feature.group]
                    self.feature_annotation.loc[feature.feature_id, "fill_colour"] = feature.vis_prms["fill_colour"]
        return None
    except Exception as error:
        raise lovis4u.Manager.lovis4uError("Unable to set feature colours based on groups.") from error

Locus

A Locus object represents a particular locus that will be one of the sequence tracks on final figure.

Attributes:

  • seq_id (str) –

    Sequence identifier. Can be used to label locus.

  • coordinates (list) –

    List of regions to be shown. Each region format: dict with keys: start, end, strand and corresponding 1-based start, end coordinates and strand (1: plus strand, -1: minus strand).

  • size (int) –

    total length of regions to be plotted.

  • description (str) –

    Sequence description that can be used to label locus.

  • length (int) –

    full length of the locus independent on region that should be plotted.

  • circular (bool) –

    Bool value whether locus is circular or not. It defines whether you have gap or not passing 1 value on the final figure.

  • features (list) –

    list of Feature objects that overlapped with coordinates.

  • order (int) –

    index on ordered list of loci visualisation. Can be pre-defined or found based on loci hierarchical clustering with cluster_sequences category.

  • group (int | str) –

    locus group that defines set of closely-related loci.

  • category_colours (dict) –

    colours for locus' features categories.

  • prms (Parameters) –

    Parameters' class object that holds config and cmd arguments.

Source code in lovis4u/DataProcessing.py
class Locus:
    """A Locus object represents a particular locus that will be one of the sequence tracks on final figure.

    Attributes:
        seq_id (str): Sequence identifier. Can be used to label locus.
        coordinates (list): List of regions to be shown. Each region format: dict with keys: start, end, strand and
            corresponding 1-based start, end coordinates and strand (1: plus strand, -1: minus strand).
        size (int): total length of regions to be plotted.
        description (str): Sequence description that can be used to label locus.
        length (int): full length of the locus independent on region that should be plotted.
        circular (bool): Bool value whether locus is circular or not. It defines whether you have gap or not passing
            1 value on the final figure.
        features (list): list of Feature objects that overlapped with coordinates.
        order (int): index on ordered list of loci visualisation. Can be pre-defined or found based on loci
            hierarchical clustering with cluster_sequences category.
        group (int | str): locus group that defines set of closely-related loci.
        category_colours (dict): colours for locus' features categories.
        prms (lovis4u.Manager.Parameters): Parameters' class object that holds config and cmd arguments.

    """

    def __init__(self, seq_id: str, coordinates: list, description: str, sequence: Bio.Seq.Seq, length: int,
                 circular: bool, features: list, order: int, parameters: lovis4u.Manager.Parameters,
                 group: typing.Union[int, str] = 1):
        """Create a Locus object.

        Arguments:
            seq_id (str): Sequence identifier. Can be used to label locus.
            coordinates (list): List of regions to be shown. Each region format: dict with keys: start, end, strand and
                corresponding 1-based start, end coordinates and strand (1: plus strand, -1: minus strand).
            description (str): Sequence description that can be used to label locus.
            sequence (Bio.Seq.Seq): Locus Sequence.
            length (int): full length of the locus independent on region that should be plotted.
            circular (bool): Bool value whether locus is circular or not. It defines whether you have gap or not passing
                1 value on the final figure.
            features (list): list of Feature objects that overlapped with coordinates.
            order (int): index on ordered list of loci visualisation. Can be pre-defined or found based on loci
            hierarchical clustering with cluster_sequences category.
            parameters (lovis4u.Manager.Parameters): Parameters' class object that holds config and cmd arguments.
            group (int | str): locus group that defines set of closely-related loci [1].

        """
        self.seq_id = seq_id
        self.coordinates = coordinates
        self.size = 0
        taken_coordinates = []
        for coordinate in coordinates:
            self.size += abs(coordinate["end"] - coordinate["start"] + 1)
            taken_coordinates += list(range(coordinate["start"], coordinate["end"] + 1))
        if len(taken_coordinates) != len(set(taken_coordinates)):
            raise lovis4u.Manager.lovis4uError(f"Specified coordinates seems to be overlapped"
                                               f" or not in 0-based format.")
        for coordinate in coordinates:
            if coordinate["start"] < 1:
                raise lovis4u.Manager.lovis4uError(f"Coordinates for {seq_id}: {coordinate} is in 0-based format"
                                                   f" while input should be in 1-based.")
            if coordinate["end"] > length:
                raise lovis4u.Manager.lovis4uError(f"Coordinates for {seq_id}: {coordinate} is out of sequence length"
                                                   f" ({length} nt).")

        self.description = description
        self.sequence = sequence
        self.length = length
        self.circular = circular
        self.features = features
        self.order = order
        self.group = group
        self.category_colours = dict()
        self.prms = parameters

    def calculate_sequence_property(self, window_size: int = 29, property_name: str = "gc") -> list:
        """Method to calculate gc content or gc skew

        Returns:
            list: list with corresponding values for each nucleotide

        """
        try:
            result_list = []
            locus_length = len(self.sequence)
            for i in range(locus_length):
                start = (i - window_size // 2) % locus_length
                end = (i + window_size // 2 + 1) % locus_length
                if start < end:
                    window_seq = self.sequence[start:end]
                else:
                    window_seq = self.sequence[start:] + self.sequence[:end]
                if property_name == "gc":
                    result_list.append(Bio.SeqUtils.gc_fraction(window_seq) * 100)
                elif property_name == "gc_skew":
                    result_list.append(Bio.SeqUtils.GC_skew(window_seq, len(window_seq))[0])
            return result_list
        except Exception as error:
            raise lovis4u.Manager.lovis4uError("Unable to calculate gc content for a sequence.") from error

__init__(seq_id, coordinates, description, sequence, length, circular, features, order, parameters, group=1)

Create a Locus object.

Parameters:

  • seq_id (str) –

    Sequence identifier. Can be used to label locus.

  • coordinates (list) –

    List of regions to be shown. Each region format: dict with keys: start, end, strand and corresponding 1-based start, end coordinates and strand (1: plus strand, -1: minus strand).

  • description (str) –

    Sequence description that can be used to label locus.

  • sequence (Seq) –

    Locus Sequence.

  • length (int) –

    full length of the locus independent on region that should be plotted.

  • circular (bool) –

    Bool value whether locus is circular or not. It defines whether you have gap or not passing 1 value on the final figure.

  • features (list) –

    list of Feature objects that overlapped with coordinates.

  • order (int) –

    index on ordered list of loci visualisation. Can be pre-defined or found based on loci

  • parameters (Parameters) –

    Parameters' class object that holds config and cmd arguments.

  • group (int | str, default: 1 ) –

    locus group that defines set of closely-related loci [1].

Source code in lovis4u/DataProcessing.py
def __init__(self, seq_id: str, coordinates: list, description: str, sequence: Bio.Seq.Seq, length: int,
             circular: bool, features: list, order: int, parameters: lovis4u.Manager.Parameters,
             group: typing.Union[int, str] = 1):
    """Create a Locus object.

    Arguments:
        seq_id (str): Sequence identifier. Can be used to label locus.
        coordinates (list): List of regions to be shown. Each region format: dict with keys: start, end, strand and
            corresponding 1-based start, end coordinates and strand (1: plus strand, -1: minus strand).
        description (str): Sequence description that can be used to label locus.
        sequence (Bio.Seq.Seq): Locus Sequence.
        length (int): full length of the locus independent on region that should be plotted.
        circular (bool): Bool value whether locus is circular or not. It defines whether you have gap or not passing
            1 value on the final figure.
        features (list): list of Feature objects that overlapped with coordinates.
        order (int): index on ordered list of loci visualisation. Can be pre-defined or found based on loci
        hierarchical clustering with cluster_sequences category.
        parameters (lovis4u.Manager.Parameters): Parameters' class object that holds config and cmd arguments.
        group (int | str): locus group that defines set of closely-related loci [1].

    """
    self.seq_id = seq_id
    self.coordinates = coordinates
    self.size = 0
    taken_coordinates = []
    for coordinate in coordinates:
        self.size += abs(coordinate["end"] - coordinate["start"] + 1)
        taken_coordinates += list(range(coordinate["start"], coordinate["end"] + 1))
    if len(taken_coordinates) != len(set(taken_coordinates)):
        raise lovis4u.Manager.lovis4uError(f"Specified coordinates seems to be overlapped"
                                           f" or not in 0-based format.")
    for coordinate in coordinates:
        if coordinate["start"] < 1:
            raise lovis4u.Manager.lovis4uError(f"Coordinates for {seq_id}: {coordinate} is in 0-based format"
                                               f" while input should be in 1-based.")
        if coordinate["end"] > length:
            raise lovis4u.Manager.lovis4uError(f"Coordinates for {seq_id}: {coordinate} is out of sequence length"
                                               f" ({length} nt).")

    self.description = description
    self.sequence = sequence
    self.length = length
    self.circular = circular
    self.features = features
    self.order = order
    self.group = group
    self.category_colours = dict()
    self.prms = parameters

calculate_sequence_property(window_size=29, property_name='gc')

Method to calculate gc content or gc skew

Returns:

  • list ( list ) –

    list with corresponding values for each nucleotide

Source code in lovis4u/DataProcessing.py
def calculate_sequence_property(self, window_size: int = 29, property_name: str = "gc") -> list:
    """Method to calculate gc content or gc skew

    Returns:
        list: list with corresponding values for each nucleotide

    """
    try:
        result_list = []
        locus_length = len(self.sequence)
        for i in range(locus_length):
            start = (i - window_size // 2) % locus_length
            end = (i + window_size // 2 + 1) % locus_length
            if start < end:
                window_seq = self.sequence[start:end]
            else:
                window_seq = self.sequence[start:] + self.sequence[:end]
            if property_name == "gc":
                result_list.append(Bio.SeqUtils.gc_fraction(window_seq) * 100)
            elif property_name == "gc_skew":
                result_list.append(Bio.SeqUtils.GC_skew(window_seq, len(window_seq))[0])
        return result_list
    except Exception as error:
        raise lovis4u.Manager.lovis4uError("Unable to calculate gc content for a sequence.") from error