I have a large dataframe in which I calculate a p-value using a t-test for each row. I now want a boxplot of only the rows with the ten lowest p-values.
# Build one long table of gene counts for every candidate gene of every lead
# SNP, attach a CSF_N-vs-PB_N t-test p-value to each gene, then box-plot only
# the genes with the lowest p-values.
LeadSNPs = pd.unique(candidate_genes.LeadSNP) #rs3184504 rs531612
gene_counts_per_snp_df = pd.DataFrame()  # an actual empty frame (pd.DataFrame.empty is a property, not a frame)
save_path = "../figures/SM5_gene_counts/"
n_top_genes = 10  # how many lowest-p-value genes end up in the plot
ttest_compartments = ['CSF_N', 'PB_N']  # the two compartments compared by the t-test

# Frames are collected in lists and concatenated once, so a SNP (or gene)
# that yields no data cannot leave a stale frame behind to be re-appended.
per_snp_frames = []
for LeadSNP in LeadSNPs:
    print(LeadSNP)
    # Boolean mask instead of np.where positions: positions only coincide
    # with index labels when candidate_genes has a default RangeIndex.
    snp_mask = candidate_genes.LeadSNP == LeadSNP
    candidate_genes_per_SNP = candidate_genes.loc[snp_mask, 'Target']
    region = pd.unique(candidate_genes.loc[snp_mask, 'Region'])

    per_gene_frames = []
    for gene_index, target_gene in candidate_genes_per_SNP.items():
        PRE = candidate_genes.loc[gene_index, 'sumOfWeightedWeights (PRE)']
        print(target_gene)
        ensembl_id = get_ensembl_id(target_gene)
        print(ensembl_id)
        if pd.isnull(ensembl_id):
            continue

        gene_counts_df = get_gene_counts_df(ensembl_id)
        if gene_counts_df.shape[0] == 0:
            print('no ensemble id found in gene counts!')
            continue

        gene_counts_df = gene_counts_df.melt(id_vars=["Gene"], var_name='compartment', value_name='count')
        gene_counts_df = reshape_gene_counts_df(gene_counts_df)
        gene_counts_df['target_gene'] = target_gene
        gene_counts_df['PRE'] = PRE
        # Column name kept as 'pval_ftest' for compatibility, although the
        # value comes from an independent two-sample t-test.
        gene_counts_df['pval_ftest'] = np.nan

        csf_counts = gene_counts_df.loc[gene_counts_df['compartment'] == 'CSF_N', 'count']
        pb_counts = gene_counts_df.loc[gene_counts_df['compartment'] == 'PB_N', 'count']
        # The original stored this in `pval1` but then assigned the undefined
        # name `pval_ftest`; use a single name for both.
        pval_ftest = stats.ttest_ind(csf_counts, pb_counts)[1]
        # Only rows of the two compared compartments carry the p-value;
        # all other compartments keep NaN.
        gene_counts_df.loc[gene_counts_df['compartment'].isin(ttest_compartments), 'pval_ftest'] = pval_ftest

        per_gene_frames.append(gene_counts_df)

    if not per_gene_frames:
        # No usable gene for this SNP: nothing to label or append.
        continue

    gene_counts_per_snp_df = pd.concat(per_gene_frames)
    gene_counts_per_snp_df['LeadSNP'] = LeadSNP
    per_snp_frames.append(gene_counts_per_snp_df)

if per_snp_frames:
    all_gene_counts = pd.concat(per_snp_frames).reset_index()
else:
    all_gene_counts = pd.DataFrame()

if all_gene_counts.empty:
    print('no gene counts collected; nothing to plot')
else:
    # One p-value per gene (min skips the NaN rows of untested compartments),
    # then keep the n_top_genes genes with the smallest p-values.
    gene_pvals = all_gene_counts.groupby('target_gene')['pval_ftest'].min().dropna()
    top_genes = gene_pvals.nsmallest(n_top_genes).index
    all_gene_counts_per_comp = all_gene_counts[all_gene_counts['target_gene'].isin(top_genes)]
    if all_gene_counts_per_comp.empty:
        print('no valid p-values computed; nothing to plot')
    else:
        plot_top_genes_snps(all_gene_counts_per_comp, 'target_gene')
The plotting code is given here:
def plot_top_genes_snps(all_gene_counts_per_comp, x_label):
    """Draw and show a grouped boxplot of gene counts per compartment.

    The 'count' column is plotted on the y-axis, grouped along the x-axis by
    the column named ``x_label`` and coloured by the 'compartment' column.
    Relies on two names defined outside this function: ``comp_order`` (passed
    as the hue order) and ``comp_dict`` (looked up with each legend handle's
    label to obtain the legend text).

    Args:
        all_gene_counts_per_comp: Data passed to ``sns.boxplot``; must provide
            the 'count' and 'compartment' columns and the ``x_label`` column.
        x_label: Name of the column used for the x-axis categories.
    """
    sns.set(style="white")
    sns.set_context("poster")
    palette = sns.color_palette("colorblind", 10)

    fig, ax = plt.subplots(figsize=(25,4))
    sns.boxplot(
        data=all_gene_counts_per_comp,
        x=x_label,
        y='count',
        hue='compartment',
        hue_order=comp_order,
        palette=palette,
        showfliers=False,
        ax=ax,
    )

    # Slant the category labels and right-align them.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')

    # Rebuild the legend with the labels looked up in comp_dict and anchor it
    # at the upper-right corner of the axes.
    handles, _ = ax.get_legend_handles_labels()
    legend_labels = [comp_dict[handle.get_label()] for handle in handles]
    ax.legend(handles, legend_labels, bbox_to_anchor=(1, 1), loc=2)

    ax.yaxis.grid()
    # NOTE(review): called after the figure is drawn, so this presumably only
    # affects figures created later - confirm that is the intent.
    sns.set(font_scale = 2)
    plt.xlabel('')
    plt.ylabel('Gene count')
    # plt.savefig(save_path+str(LeadSNP)+'.pdf', bbox_inches='tight')
    plt.show()
For context, I want the ten 'target_gene' values with the lowest p-values. However, this is the plot I am getting:
How do I extract only the rows with the ten lowest p-values and box-plot them?
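Sorting by the p-value column and keeping the first ten rows, e.g. `df.sort_values("pval_ftest").head(10)`, will give you the top 10 rows with the smallest "pval_ftest" value. Maybe this toy example will make it clearer how to sort and select subsets of a DataFrame.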