1
nCoV - discussion / covidcg.org's big file encoding Gisaid but hiding the names and not showing the NNNNN
« on: January 17, 2021, 02:23:13 am »
Download and unzip https://storage.googleapis.com/ve-public/v1.4/data_package.json.gz
Then run the following with Node.js (on my laptop the file is too big for Chrome):
Code: [Select]
// Load the unzipped data package; require() parses the JSON file into an object.
var t = require('./data_package.json');
// fr_loc maps a location_id to the label of the top-level French entry it sits under.
var fr_loc = {};
// NOTE(review): children[2].children[15] is a positional lookup into the geo tree,
// presumably tied to the layout of this data release - verify before reusing on another version.
var c = t["geo_select_tree"].children[2].children[15].children; // list of French locations
c.forEach(function (region) {
  if (!region) return; // skip empty entries
  fr_loc[region.location_id] = region.label;
  region.children.forEach(function (place) {
    // Sub-locations are recorded under their parent's label, not their own.
    if (place) fr_loc[place.location_id] = region.label;
  });
});
// Keep only the case_data entries whose location_id has a (truthy) label in fr_loc.
var fr_sequences = t.case_data.filter(function (seq) {
  return fr_loc[seq.location_id];
});
// Tally the French sequences per submitting lab. Each key is whatever
// t.metadata_map.submitting_lab holds for the sequence's submitting_lab id.
var lab_count = {};
fr_sequences.forEach(function (seq) {
  var lab = t.metadata_map.submitting_lab[seq.submitting_lab];
  lab_count[lab] = lab_count[lab] || 0; // start a new counter at zero
  lab_count[lab]++;
});
// Bare expression so the REPL prints the resulting counts.
lab_count;
// SNP is the reverse of t.metadata_map.dna_snp: it maps each stored value
// back to its own key in that map.
var SNP = {};
Object.keys(t.metadata_map.dna_snp).forEach(function (name) {
  SNP[t.metadata_map.dna_snp[name]] = name;
});
// Add two derived fields to every French sequence (the objects are modified in place):
//   mut - each entry of dna_snp_str translated through the SNP reverse lookup
//   loc - the fr_loc label for the sequence's location_id
fr_sequences.forEach(function (seq) {
  seq.mut = seq.dna_snp_str.map(function (id) {
    return SNP[id];
  });
  seq.loc = fr_loc[seq.location_id];
});
Output: a table of 2753 sequences, the first of which is:
Code: [Select]
> fr_sequences[0]
{ 'Accession ID': '2d58d2cd',
collection_date: '2020-02-26',
submission_date: '2020-03-14',
gender: 0,
age_start: 36,
age_end: 37,
patient_status: 0,
passage: 0,
specimen: 5,
lineage: 'A.2',
clade: 'S',
sequencing_tech: 7,
assembly_method: 23,
comment_type: -1,
authors: 39,
originating_lab: 51,
submitting_lab: 30,
dna_snp_str: [ 28245, 28855, 4217, 14839, 15498, 18708, 19358, 19677 ],
gene_aa_snp_str: [ 14838, 4890, 18663, 18297, 19910, 605 ],
protein_aa_snp_str: [ 11687, 13956, 13590, 15422, 605, 15820 ],
location_id: 1315,
mut:
[ '8782|C|T',
'9477|T|A',
'14805|C|T',
'25553|C|T',
'25979|G|T',
'28144|T|C',
'28657|C|T',
'28863|C|T' ],
loc: 'Grand-Est' }