#WORKUNIT('name', 'Pseudomonas Gene');
IMPORT std;

// Name:  Eric Bales
// Email: eric.bales@lnssi.com
//
// Data Source: http://www.pseudomonas.com/download.jsp
//
// Instructions:
//   1. Download one of the Chromosome data files in CSV format from the data
//      source URL above. The code should work for any of them.
//   2. Transfer the file to the Landing Zone.
//   3. Set the IP address of the Landing Zone here:
LandingZone_IP := '10.220.5.57';
//   4. Set the Landing Zone folder that contains the data file here
//      (the value should end in / because the filename is appended to it):
BaseDataDirectory := '/ftp/in/Pseudomonas/';
//   5. Set the name of the specific data file you downloaded here:
Filename := 'NC_002516.csv';
//   6. Execute this BWR. It reads the data from the Landing Zone, distributes
//      it, parses the nested child data, and outputs the result.
//
// Potential error:
//   Other data files might need a larger MAXLENGTH. If you don't see all of
//   the records it may be necessary to increase this value.
MaxRecordLength := 18000;

// ----------------------------------------------------------------------------------------
// Data declarations for Pseudomonas Gene data.
// Each row of the file carries child records nested inside some of its columns;
// the layouts below describe one entry of each kind of child record.

// One Gene Ontology Mapping entry.
L_GeneOntologyMapping := RECORD
  STRING NameSpace;
  STRING Accession;
  STRING Term;
  STRING EvidenceCode;
END;

// One TIGRFAM Prediction entry.
L_TIGRFAMPredictions := RECORD
  STRING Accession;
  STRING Name;
  STRING Func;
  STRING IsologyType;
  STRING Significance;
  STRING ECNumber;
  STRING Role;
  STRING SubRole;
END;

// One PFAM Prediction entry.
L_PFAMPredictions := RECORD
  STRING    Accession;
  STRING    Name;
  UNSIGNED8 Start;
  UNSIGNED8 Stop;
  STRING    EValue;
END;

// One COG Prediction entry.
L_COGPredictions := RECORD
  STRING Accession;
  STRING Name;
END;

// Primary record.
// Final, fully parsed layout: the delimited list columns become SET OF STRING
// fields and the structured columns become nested child DATASETs.
L_Pseudomonas := RECORD, MAXLENGTH(MaxRecordLength)
  STRING    Strain;
  STRING    Replicon;
  STRING    RepliconName;
  STRING    LocusTag;
  STRING    FeatureType;
  UNSIGNED8 Start;
  UNSIGNED8 Stop;
  STRING    Strand;
  STRING    GeneName;
  STRING    ProductName;
  STRING    ProductNameRating;
  STRING    RefseqAccession;
  UNSIGNED8 GI;
  STRING    Entrez;
  SET OF STRING AlternateGeneNames;
  SET OF STRING AlternateProductNames;
  STRING    Comments;
  SET OF STRING References;
  STRING    GenomicContext;
  SET OF STRING Homology;
  SET OF STRING StructureFeatures;
  SET OF STRING Pathways;
  STRING    ECNumber;
  STRING    PseudoCAPFunctionClass;
  STRING    SubcellularLocalization;
  DATASET(L_GeneOntologyMapping) GeneOntologyMappings;
  DATASET(L_TIGRFAMPredictions)  TIGRFAMPredictions;
  DATASET(L_PFAMPredictions)     PFAMPredictions;
  DATASET(L_COGPredictions)      COGPredictions;
  STRING    DNASequence;
  STRING    ProteinSequence;
END;

// ----------------------------------------------------------------------------------------
// Read the data from the Landing Zone.

// Full path of the raw file on the Landing Zone (configured folder + filename).
Raw_PseudomonasData := BaseDataDirectory + Filename;

// Raw ingest format (before parsing out child sets and datasets).
// Every list-like or structured column is read as a plain STRING here and is
// split apart later. RecordID defaults to 0 at read time and is overwritten
// with a sequence number immediately after the read.
L_Pseudomonas_Raw := RECORD, MAXLENGTH(MaxRecordLength)
  STRING    Strain;
  STRING    Replicon;
  STRING    RepliconName;
  STRING    LocusTag;
  STRING    FeatureType;
  UNSIGNED8 Start;
  UNSIGNED8 Stop;
  STRING    Strand;
  STRING    GeneName;
  STRING    ProductName;
  STRING    ProductNameRating;
  STRING    RefseqAccession;
  UNSIGNED8 GI;
  STRING    Entrez;
  STRING    AlternateGeneNames;
  STRING    AlternateProductNames;
  STRING    Comments;
  STRING    References;
  STRING    GenomicContext;
  STRING    Homology;
  STRING    StructureFeatures;
  STRING    Pathways;
  STRING    ECNumber;
  STRING    PseudoCAPFunctionClass;
  STRING    SubcellularLocalization;
  STRING    GeneOntologyMappings;
  STRING    TIGRFAMPredictions;
  STRING    PFAMPredictions;
  STRING    COGPredictions;
  STRING    DNASequence;
  STRING    ProteinSequence;
  UNSIGNED4 RecordID := 0;
END;

// The CSV file is read directly from the Landing Zone: one heading line,
// comma-separated, double-quote quoted, any common line terminator.
dsPseudomonas_In :=
  DATASET(std.File.ExternalLogicalFilename(LandingZone_IP, Raw_PseudomonasData),
          L_Pseudomonas_Raw,
          CSV(HEADING(1),
              SEPARATOR(','),
              TERMINATOR(['\r\n','\n\r','\n','\r']),
              QUOTE('"'),
              MAXLENGTH(MaxRecordLength)));

// Stamp each row with its PROJECT counter value. The RecordID is the key used
// to match the parsed child sets and datasets back to their parent row.
L_Pseudomonas_Raw AssignRecordID(L_Pseudomonas_Raw rawRow, UNSIGNED8 seq) := TRANSFORM
  SELF.RecordID := seq;
  SELF := rawRow;
END;

dsPseudomonas_Counted := PROJECT(dsPseudomonas_In, AssignRecordID(LEFT, COUNTER));

// Distribute the data across the cluster (by RecordID) to make parsing faster.
dsPseudomonas_Distributed := DISTRIBUTE(dsPseudomonas_Counted, RecordID);

// Building blocks for parsing the delimited columns:
//   sep      - '|' between the fields of one entry
//   term     - ';' after an entry
//   allOther - a run of one or more characters that are neither '|' nor ';'
PATTERN sep      := '|';
PATTERN term     := ';';
PATTERN allOther := PATTERN('[^|;]+');

// One element of a set of strings: a value optionally followed by ';'.
RULE setOfStr := allOther OPT(term);

// Holds one parsed string together with the RecordID of the row it came from.
L_SetOfStringsRec := RECORD
  STRING    s;
  UNSIGNED4 RecordID;
END;

// Parse each of the sets of strings.
// All six list columns are parsed the same way, so the PARSE call is written
// once as a macro. For the column named by rawField it produces one
// L_SetOfStringsRec per setOfStr match: the trimmed text matched by allOther,
// plus the RecordID copied from the source row.
ParseStringSet(rawField) := FUNCTIONMACRO
  RETURN PARSE(dsPseudomonas_Distributed,
               rawField,
               setOfStr,
               TRANSFORM(L_SetOfStringsRec,
                         SELF.s := TRIM(MATCHTEXT(allOther),LEFT,RIGHT);
                         SELF := LEFT;),
               SCAN, BEST, MANY);
ENDMACRO;

dsAlternateGeneNames    := ParseStringSet(AlternateGeneNames);
dsAlternateProductNames := ParseStringSet(AlternateProductNames);
dsReferences            := ParseStringSet(References);
dsHomology              := ParseStringSet(Homology);
dsStructureFeatures     := ParseStringSet(StructureFeatures);
dsPathways              := ParseStringSet(Pathways);

// Tokens used for parsing the nested child datasets. They all share the same
// underlying pattern (allOther); the distinct names let MATCHTEXT pick out a
// specific field position within a rule.
TOKEN tokenNameSpace    := allOther;
TOKEN tokenAccession    := allOther;
TOKEN tokenTerm         := allOther;
TOKEN tokenEvidenceCode := allOther;
TOKEN tokenName         := allOther;
TOKEN tokenFunc         := allOther;
TOKEN tokenIsologyType  := allOther;
TOKEN tokenSignificance := allOther;
TOKEN tokenECNumber     := allOther;
TOKEN tokenRole         := allOther;
TOKEN tokenSubRole      := allOther;
TOKEN tokenStart        := allOther;
TOKEN tokenStop         := allOther;
TOKEN tokenEValue       := allOther;

// Parse each of the nested child datasets.
// Each child layout is extended with the parent's RecordID so that the parsed
// entries can be matched back to the row they were extracted from.
L_GeneOntologyMapping_Keyed := {L_GeneOntologyMapping, UNSIGNED4 RecordID};
L_TIGRFAMPredictions_Keyed  := {L_TIGRFAMPredictions,  UNSIGNED4 RecordID};
L_PFAMPredictions_Keyed     := {L_PFAMPredictions,     UNSIGNED4 RecordID};
L_COGPredictions_Keyed      := {L_COGPredictions,      UNSIGNED4 RecordID};

// Gene Ontology Mapping entry: NameSpace|Accession|Term|EvidenceCode[;]
RULE rGeneOntologyMapping :=
  tokenNameSpace sep tokenAccession sep tokenTerm sep tokenEvidenceCode OPT(term);

dsGeneOntologyMappings :=
  PARSE(dsPseudomonas_Distributed,
        GeneOntologyMappings,
        rGeneOntologyMapping,
        TRANSFORM(L_GeneOntologyMapping_Keyed,
                  SELF.NameSpace    := TRIM(MATCHTEXT(tokenNameSpace),LEFT,RIGHT);
                  SELF.Accession    := TRIM(MATCHTEXT(tokenAccession),LEFT,RIGHT);
                  SELF.Term         := TRIM(MATCHTEXT(tokenTerm),LEFT,RIGHT);
                  SELF.EvidenceCode := TRIM(MATCHTEXT(tokenEvidenceCode),LEFT,RIGHT);
                  SELF := LEFT;),
        SCAN, BEST, MANY);

// TIGRFAM Prediction entry:
//   Accession|Name|Func|IsologyType|Significance|ECNumber|Role|SubRole[;]
RULE rTIGRFAM :=
  tokenAccession sep tokenName sep tokenFunc sep tokenIsologyType sep
  tokenSignificance sep tokenECNumber sep tokenRole sep tokenSubRole OPT(term);

dsTIGRFAMPredictions :=
  PARSE(dsPseudomonas_Distributed,
        TIGRFAMPredictions,
        rTIGRFAM,
        TRANSFORM(L_TIGRFAMPredictions_Keyed,
                  SELF.Accession    := TRIM(MATCHTEXT(tokenAccession),LEFT,RIGHT);
                  SELF.Name         := TRIM(MATCHTEXT(tokenName),LEFT,RIGHT);
                  SELF.Func         := TRIM(MATCHTEXT(tokenFunc),LEFT,RIGHT);
                  SELF.IsologyType  := TRIM(MATCHTEXT(tokenIsologyType),LEFT,RIGHT);
                  SELF.Significance := TRIM(MATCHTEXT(tokenSignificance),LEFT,RIGHT);
                  SELF.ECNumber     := TRIM(MATCHTEXT(tokenECNumber),LEFT,RIGHT);
                  SELF.Role         := TRIM(MATCHTEXT(tokenRole),LEFT,RIGHT);
                  SELF.SubRole      := TRIM(MATCHTEXT(tokenSubRole),LEFT,RIGHT);
                  SELF := LEFT;),
        SCAN, BEST, MANY);

// PFAM Prediction entry: Accession|Name|Start|Stop|EValue[;]
// Start and Stop are cast from the matched text to UNSIGNED8.
RULE rPFAM :=
  tokenAccession sep tokenName sep tokenStart sep tokenStop sep tokenEValue OPT(term);

dsPFAMPredictions :=
  PARSE(dsPseudomonas_Distributed,
        PFAMPredictions,
        rPFAM,
        TRANSFORM(L_PFAMPredictions_Keyed,
                  SELF.Accession := TRIM(MATCHTEXT(tokenAccession),LEFT,RIGHT);
                  SELF.Name      := TRIM(MATCHTEXT(tokenName),LEFT,RIGHT);
                  SELF.Start     := (unsigned8)TRIM(MATCHTEXT(tokenStart),LEFT,RIGHT);
                  SELF.Stop      := (unsigned8)TRIM(MATCHTEXT(tokenStop),LEFT,RIGHT);
                  SELF.EValue    := TRIM(MATCHTEXT(tokenEValue),LEFT,RIGHT);
                  SELF := LEFT;),
        SCAN, BEST, MANY);

// COG Prediction entry: Accession|Name[;]
RULE rCOG := tokenAccession sep tokenName OPT(term);

dsCOGPredictions :=
  PARSE(dsPseudomonas_Distributed,
        COGPredictions,
        rCOG,
        TRANSFORM(L_COGPredictions_Keyed,
                  SELF.Accession := TRIM(MATCHTEXT(tokenAccession),LEFT,RIGHT);
                  SELF.Name      := TRIM(MATCHTEXT(tokenName),LEFT,RIGHT);
                  SELF := LEFT;),
        SCAN, BEST, MANY);

// Builds one final-format row from one raw row. For every parsed set of
// strings and nested child dataset, only the entries whose RecordID matches
// the current row are kept and attached to it. The remaining fields are copied
// straight across, which also converts the layout from Raw to final.
L_Pseudomonas BuildPseudomonas(L_Pseudomonas_Raw inRow) := TRANSFORM
  SELF.AlternateGeneNames    := SET(dsAlternateGeneNames(RecordID=inRow.RecordID),s);
  SELF.AlternateProductNames := SET(dsAlternateProductNames(RecordID=inRow.RecordID),s);
  SELF.References            := SET(dsReferences(RecordID=inRow.RecordID),s);
  SELF.Homology              := SET(dsHomology(RecordID=inRow.RecordID),s);
  SELF.StructureFeatures     := SET(dsStructureFeatures(RecordID=inRow.RecordID),s);
  SELF.Pathways              := SET(dsPathways(RecordID=inRow.RecordID),s);
  // The inner PROJECTs drop the RecordID column from the keyed child rows.
  SELF.GeneOntologyMappings  := PROJECT(dsGeneOntologyMappings(RecordID=inRow.RecordID),
                                        TRANSFORM(L_GeneOntologyMapping, SELF:=LEFT;));
  SELF.TIGRFAMPredictions    := PROJECT(dsTIGRFAMPredictions(RecordID=inRow.RecordID),
                                        TRANSFORM(L_TIGRFAMPredictions, SELF:=LEFT;));
  SELF.PFAMPredictions       := PROJECT(dsPFAMPredictions(RecordID=inRow.RecordID),
                                        TRANSFORM(L_PFAMPredictions, SELF:=LEFT;));
  SELF.COGPredictions        := PROJECT(dsCOGPredictions(RecordID=inRow.RecordID),
                                        TRANSFORM(L_COGPredictions, SELF:=LEFT;));
  SELF := inRow;
END;

dsPseudomonas := PROJECT(dsPseudomonas_Distributed, BuildPseudomonas(LEFT));

// ----------------------------------------------------------------------------------------
// Just output the data to show that it was read in and parsed.
OUTPUT(dsPseudomonas);