# -*- coding: utf-8 -*-
"""
Created on Fri Aug 27 14:06:17 2021
@author: Shen Wanxiang
"""
import csv
from collections import defaultdict
from pprint import pprint
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage, to_tree
from scipy.spatial.distance import squareform
# Header text that mp2newick() writes at the top of the iTOL dataset
# (``<treefile>.txt``) before appending the per-leaf rows. The string is
# emitted verbatim at runtime, so its contents must not be reformatted.
itol_header = '''TREE_COLORS
SEPARATOR TAB
#First 3 fields define the node id, type and color
#Possible types are:
#'range': defines a colored range (colored background for labels/clade)
#'clade': defines color/style for all branches in a clade
#'branch': defines color/style for a single branch
#'label': defines font color/style for the leaf label
#'label_background': defines the leaf label background color
#The following additional fields are required:
#for 'range', field 4 defines the colored range label (used in the legend)
#The following additional fields are optional:
#for 'label', field 4 defines the font style ('normal',''bold', 'italic' or 'bold-italic') and field 5 defines the numeric scale factor for the font size (eg. with value 2, font size for that label will be 2x the standard size)
#for 'clade' and 'branch', field 4 defines the branch style ('normal' or 'dashed') and field 5 defines the branch width scale factor (eg. with value 0.5, branch width for that clade will be 0.5 the standard width)
DATA
#NODE_ID TYPE COLOR LABEL_OR_STYLE SIZE_FACTOR
'''
def _getNewick(node, newick, parentdist, leaf_names):
    """Recursively serialise a binary cluster tree into a Newick string.

    The text is assembled right-to-left: ``newick`` is the suffix that must
    follow this node's own serialisation. Within each internal node the
    right child is emitted before the left child.

    Parameters
    ----------
    node
        Tree node exposing ``is_leaf()``, ``get_left()``, ``get_right()``,
        ``id`` and ``dist`` (the interface of the nodes returned by
        ``scipy.cluster.hierarchy.to_tree``).
    newick : str
        Already-built suffix; pass ``""`` when calling on the root.
    parentdist : float
        ``dist`` of the parent node. The branch length written for ``node``
        is ``parentdist - node.dist``, formatted with two decimals.
    leaf_names : sequence
        Leaf labels, indexed by ``node.id``.

    Returns
    -------
    str
        Newick text for ``node`` followed by ``newick``; terminated with
        ``;`` when called on the root with an empty suffix.
    """
    branch_length = parentdist - node.dist

    if node.is_leaf():
        return "%s:%.2f%s" % (leaf_names[node.id], branch_length, newick)

    if len(newick) > 0:
        # Inner clade: close it and attach its branch length before the suffix.
        newick = "):%.2f%s" % (branch_length, newick)
    else:
        # Root: nothing follows, so just close the tree.
        newick = ");"

    newick = _getNewick(node.get_left(), newick, node.dist, leaf_names)
    newick = _getNewick(node.get_right(), ",%s" % newick, node.dist, leaf_names)
    return "(%s" % newick
def mp2newick(mp, treefile='phenotype_tree', leaf_names=None):
    """Export the hierarchical clustering of ``mp`` as iTOL input files.

    Two files are written:

    * ``<treefile>.nwk`` -- the Newick tree built from ``mp.Z``;
    * ``<treefile>.txt`` -- ``itol_header`` followed by one tab-separated
      row per leaf (leaf name, ``'clade'``, colour, ``'normal'``).

    Parameters
    ----------
    mp
        Object exposing ``df_embedding`` (a DataFrame with at least the
        ``'colors'`` and ``'Subtypes'`` columns), ``flist`` (default leaf
        names) and ``Z`` (a linkage matrix accepted by
        ``scipy.cluster.hierarchy.to_tree``).
    treefile : str
        Output path prefix; ``.nwk`` and ``.txt`` are appended to it.
    leaf_names : sequence, optional
        Leaf labels to use instead of ``mp.flist``. Must have one entry per
        row of ``mp.df_embedding``.

    Returns
    -------
    pandas.DataFrame
        The rows written to the dataset file, with columns ``TYPE``,
        ``colors`` and ``STYLE``, indexed by the leaf names.

    Raises
    ------
    AssertionError
        If ``leaf_names`` is given and its length differs from the number
        of rows in ``mp.df_embedding``.
    """
    # Work on an explicit copy: columns are added below and the caller's
    # embedding frame must not be touched (also avoids pandas'
    # SettingWithCopyWarning).
    df = mp.df_embedding[['colors', 'Subtypes']].copy()

    # ``is None`` rather than ``== None`` so that array-like leaf_names
    # (NumPy array, pandas Index) do not raise on an element-wise comparison.
    if leaf_names is None:
        leaf_names = mp.flist  # use final list only
    elif len(leaf_names) != len(df):
        # Raised explicitly (not via ``assert``) so the check survives ``-O``.
        raise AssertionError(
            'leaf names should be a list with a length of %s' % len(df)
        )

    # The annotation rows are keyed by the same labels used in the Newick
    # tree. NOTE(review): the scraped source lost its indentation; this is
    # assumed to apply to both branches above -- confirm against upstream.
    df.index = leaf_names

    root = to_tree(mp.Z, rd=False)
    newick = _getNewick(root, "", root.dist, leaf_names=leaf_names)

    # write newick file for itol
    with open(treefile + '.nwk', 'w') as f:
        f.write(newick)

    # write dataset file for itol
    df['TYPE'] = 'clade'
    df['STYLE'] = 'normal'
    df = df[['TYPE', 'colors', 'STYLE']]
    with open(treefile + '.txt', 'w') as f:
        f.write(itol_header)
    # Append the data rows below the header written above.
    df.to_csv(treefile + '.txt', mode='a', header=None, sep='\t')
    return df
def tree():
    """Return an empty, arbitrarily nestable mapping.

    Looking up a missing key creates another such mapping under that key,
    so nested paths can be built simply by indexing.
    """
    return defaultdict(tree)
def tree_add(t, path):
    """Walk ``path`` down from ``t``, one key per nesting level.

    When ``t`` is a mapping that creates missing keys on lookup (such as
    the one returned by ``tree()``), this inserts the whole path. Nothing
    is returned; ``t`` is modified in place.

    Parameters
    ----------
    t : mapping
        Root of the nested mapping.
    path : iterable
        Keys ordered from the outermost level to the innermost.
    """
    for node in path:
        # Indexing descends one level (and creates it if the mapping
        # auto-creates missing keys).
        t = t[node]
def pprint_tree(tree_instance):
    """Pretty-print a nested mapping after converting it to plain dicts.

    Parameters
    ----------
    tree_instance : mapping
        Nested mapping whose values are themselves mappings (for example a
        tree built with ``tree()`` / ``tree_add()``).
    """
    def dicts(t):
        # Rebuild every level as a plain dict before printing.
        return {key: dicts(child) for key, child in t.items()}

    pprint(dicts(tree_instance))
def dfs_to_tree(dfs):
    """Build a nested tree from a DataFrame of root-to-leaf paths.

    Each row is one path: its non-missing values, taken left to right, are
    the keys from the outermost level to the innermost.

    Parameters
    ----------
    dfs : pandas.DataFrame
        One row per path; missing cells (NaN) are skipped.

    Returns
    -------
    defaultdict
        Nested mapping as produced by ``tree()``.
    """
    t = tree()
    for _, row in dfs.iterrows():
        # Drop missing levels so shorter paths simply end earlier.
        tree_add(t, row.dropna().tolist())
    return t
def tree_to_newick(root):
    """Serialise a nested mapping as a Newick string without branch lengths.

    Each key becomes a node label; a node with children is written as
    ``(child1,child2,...)label``. Sibling nodes are joined with commas. No
    trailing ``;`` is appended.

    Parameters
    ----------
    root : mapping
        Nested mapping of ``label -> children mapping`` with string labels.

    Returns
    -------
    str
        Comma-joined serialisation of all entries in ``root`` (``''`` for
        an empty mapping).
    """
    items = []
    for label, children in root.items():
        prefix = ''
        if children:
            sub_tree = tree_to_newick(children)
            # Skip the parentheses when the subtree serialises to nothing.
            if sub_tree != '':
                prefix = '(' + sub_tree + ')'
        items.append(prefix + label)
    return ','.join(items)
def dfs_to_weightless_newick(dfs):
    """Convert a DataFrame of root-to-leaf paths into a Newick string.

    The rows are first merged into a nested tree with ``dfs_to_tree`` and
    the tree is then serialised with ``tree_to_newick``; the result carries
    no branch lengths.

    Parameters
    ----------
    dfs : pandas.DataFrame
        One row per path, outermost level in the first column.

    Returns
    -------
    str
        Newick text as produced by ``tree_to_newick``.
    """
    return tree_to_newick(dfs_to_tree(dfs))
if __name__ == '__main__':
    # Demo: turn a list of lineage strings into a branch-length-free Newick
    # file. Presumably each entry looks like ``k__X|p__Y|...|s__Z`` (levels
    # separated by '|', level prefix and name separated by '__') -- verify
    # against the actual species.list.csv.
    species_list = pd.read_csv(
        './species.list.csv', header=None, index_col=0
    )[1].to_list()

    # One column per taxonomic level. Split on the first '__' only, so a
    # name that itself contains '__' no longer breaks the dict construction.
    dfs = pd.Series(species_list).apply(
        lambda x: dict(i.split('__', 1) for i in x.split('|'))
    ).apply(pd.Series)

    level_dict = {
        'k': 'kingdom',
        'p': 'phylum',
        'c': 'class',
        'o': 'order',
        'f': 'family',
        'g': 'genus',
        's': 'species',
    }
    dfs = dfs.rename(columns=level_dict)

    nwk_string = dfs_to_weightless_newick(dfs)
    with open("1.nwk", "w") as f:
        f.write(nwk_string)