/*************************************************************************************
    DBCountPreprocessor.java
    Preprocesses data ready to calculate a count diamond.  No measures.
    Copyright (C) 2008  Hazel Webb  hazel.webb@unb.ca

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
**/

/*
 * DBCountPreprocessor.java
 * input:  fname: csv fact table
 *         d: number of dimensions
 *         [ n: normalized binary output]
 *         and/or [ s: hash table(Key, Count) character output]
 *
 * output: 1) array file:  number of dims,
 *                         then the cardinality of each dim,
 *                         then the counts for each attribute
 *                         (all written as packed ints)
 *         2) binary file: data stored as packed ints (no eol to indicate new fact),
 *                         normalized starting from 0
 *         3) hash table files suitable for reading by DBCount.java as an option
 *
 * It is expected that the output file will be used by DBCount_binary or DBCount
 * to process for K.
 *
 * May 6, 2008
 * changes made to standardize interface and provide optional output January 2009
 * Hazel Webb
 */

import java.util.*;
import java.io.*;

public class DBCountPreprocessor
{
    /** Single usage message shown for -h and for every command-line error. */
    private static final String USAGE =
        "Usage: DBCountPreprocessor -f filename -d dims -n AND/OR -s [-v verbose]";

    private int d;                                      // number of data dims
    private static boolean v = false;                   // noisy output
    private String file;                                // input file name
    private Vector<Integer>[] data;                     // per dim: counts indexed by normalized attribute value
    private Hashtable<String, Integer>[] mappedCoords;  // per dim: attribute value -> count
    private Hashtable<String, Integer>[] mappedNorms;   // per dim: attribute value -> normalized integer
    private static boolean norm = false;                // request normalized output
    private static boolean hash = false;                // request hashtable character output

    /**
     * Command-line entry point.
     *
     * Recognised flags: -f / -filename &lt;file&gt;, -d &lt;dims&gt;, -n (normalized binary
     * output), -s (hash table text output), -h (print usage and exit), -v (verbose).
     * An unknown flag, a flag missing its value, a non-numeric or non-positive
     * dimension count, or a missing -f / -d prints the usage message and exits
     * with status 1.
     *
     * @param args command-line arguments as described above
     */
    public static void main(String[] args)
    {
        if (args.length < 4) {
            usageAndExit(1);
        }

        DBCountPreprocessor proc = new DBCountPreprocessor();
        for (int a = 0; a < args.length; ++a) {
            String flag = args[a];
            if (flag.equals("-f") || flag.equals("-filename")) {
                proc.file = valueAfter(args, ++a);
            } else if (flag.equals("-d")) {
                int dims = 0;
                try {
                    dims = Integer.parseInt(valueAfter(args, ++a));
                } catch (NumberFormatException notANumber) {
                    usageAndExit(1);
                }
                if (dims < 1) {
                    usageAndExit(1);
                }
                proc.allocateTables(dims);
            } else if (flag.equals("-n")) {
                norm = true;
            } else if (flag.equals("-s")) {
                hash = true;
            } else if (flag.equals("-h")) {
                usageAndExit(0);
            } else if (flag.equals("-v")) {
                v = true;
            } else {
                usageAndExit(1);
            }
        }
        if (proc.file == null || proc.d < 1) {
            usageAndExit(1);
        }

        // timing starts
        long startTime = System.currentTimeMillis();

        // createHashTables handles both output kinds by checking norm and hash
        proc.createHashTables(proc.file, proc.d);
        if (hash) {
            proc.writeHashTables(proc.file);    // only write the hash table files if requested
        }
        if (norm) {
            proc.writeArrays(proc.file);        // only write the normalized files and arrays if requested
            proc.writeNormFile(proc.file, proc.file + ".norm");
        }

        long endTime = System.currentTimeMillis();
        double time = (endTime - startTime) / 1000.0;
        System.out.println("Time for preprocessing: " + time + " seconds");
        // proc.checkFile(proc.file); // sanity check: only use on small files - it prints everything
    }

    /** Prints the usage message and terminates the JVM with the given status. */
    private static void usageAndExit(int status)
    {
        System.out.println(USAGE);
        System.exit(status);
    }

    /** Returns args[index], or prints usage and exits when the flag's value is missing. */
    private static String valueAfter(String[] args, int index)
    {
        if (index >= args.length) {
            usageAndExit(1);
        }
        return args[index];
    }

    /** Sets the dimension count and allocates one empty count vector and two empty maps per dimension. */
    private void allocateTables(int dims)
    {
        @SuppressWarnings("unchecked") // Java cannot create an array of a parameterized type directly
        Vector<Integer>[] counts = new Vector[dims];
        @SuppressWarnings("unchecked")
        Hashtable<String, Integer>[] coords = new Hashtable[dims];
        @SuppressWarnings("unchecked")
        Hashtable<String, Integer>[] norms = new Hashtable[dims];
        for (int i = 0; i < dims; ++i) {
            counts[i] = new Vector<Integer>();
            coords[i] = new Hashtable<String, Integer>();
            norms[i] = new Hashtable<String, Integer>();
        }
        d = dims;
        data = counts;
        mappedCoords = coords;
        mappedNorms = norms;
    }

    /** Closes a stream during cleanup; a failure here is ignored because the primary outcome was already reported. */
    private static void closeQuietly(Closeable resource)
    {
        if (resource != null) {
            try {
                resource.close();
            } catch (IOException ignored) {
                // best-effort cleanup only
            }
        }
    }

    /** Copies the bytes of one file to another, replacing the target if it exists. */
    private static void copyFile(String from, String to) throws IOException
    {
        InputStream in = null;
        OutputStream out = null;
        try {
            in = new FileInputStream(from);
            out = new FileOutputStream(to);
            byte[] buffer = new byte[8192];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
            out.close();    // surface write errors instead of losing them in cleanup
        } finally {
            closeQuietly(in);
            closeQuietly(out);
        }
    }

    /**
     * Splits one csv row into its fields.
     *
     * A limit of -1 keeps trailing empty fields, so a row such as "a," yields
     * two fields instead of one.
     *
     * @param line   raw line from the fact table
     * @param dims   number of leading fields the caller will read
     * @param lineNo 1-based line number, used only in the error message
     * @return the fields, or null for an empty line (which callers skip)
     * @throws IllegalArgumentException if the row has fewer than dims fields
     */
    private static String[] splitFields(String line, int dims, int lineNo)
    {
        if (line.length() == 0) {
            return null;
        }
        String[] fields = line.split(",", -1);
        if (fields.length < dims) {
            throw new IllegalArgumentException("line " + lineNo + ": expected at least " + dims
                + " comma-separated fields but found " + fields.length + ": " + line);
        }
        return fields;
    }

    /**
     * writeArrays
     * Writes the per-dimension counts to 'fname_array' as packed ints and then
     * copies that file to 'fname_array_bak'.
     *
     * Layout: numberOfDimensions, dim0Size dim1Size ... dimnSize,
     * attr counts for dim0, attr counts for dim1, ... attr counts for dimn.
     * The header comment of this file says the output is read by DBCount_binary.
     *
     * @param fname name of the input file; "_array" is appended to form the output name
     */
    public void writeArrays(String fname)
    {
        fname = fname + "_array";
        DataOutputStream data_out = null;
        try {
            data_out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(fname)));
            data_out.writeInt(d);                               // number dims
            for (int i = 0; i < d; ++i) {
                data_out.writeInt(mappedNorms[i].size());       // dim sizes
            }
            for (int i = 0; i < d; ++i) {
                int size = mappedNorms[i].size();
                for (int j = 0; j < size; ++j) {
                    data_out.writeInt(data[i].get(j));
                }
            }
            data_out.close();   // flush everything before the file is copied

            // now make a back-up copy for use later when searching for kappa
            System.out.println("backing up " + fname + " to " + fname + "_bak");
            copyFile(fname, fname + "_bak");
        } catch (IOException e) {
            System.out.println("IO exception = " + e);
        } finally {
            closeQuietly(data_out);
        }
    }

    /**
     * Reads 'fname_array' and prints every int in it to standard out.
     * Only runs when normalized output was requested.
     * WARNING: this method prints everything.  Use with care!
     *
     * @param fname name of the input file whose '_array' companion is printed
     */
    public void checkFile(String fname)
    {
        if (!norm) {
            System.out.println("Array option not chosen so validation not possible");
            return;
        }
        DataInputStream data_in = null;
        try {
            data_in = new DataInputStream(new FileInputStream(new File(fname + "_array")));
            while (true) {
                int i_data;
                try {
                    i_data = data_in.readInt();
                } catch (EOFException eof) {
                    System.out.println("End of File");
                    break;
                }
                // Print out the integers.
                System.out.printf("%3d \t", i_data);
            }
        } catch (IOException e) {
            System.out.println("IO Exception =: " + e);
        } finally {
            closeQuietly(data_in);
        }
    }

    /**
     * createHashTables
     * Reads the csv fact table once and fills the in-memory tables.
     * The csv file should have no headers or metadata.  Empty lines are skipped.
     *
     * When hash output is requested: d hashtables of (attribute value, count) pairs.
     * When normalized output is requested: d hashtables of (attribute value,
     * normalized value) pairs plus an array of vectors holding the counts for the
     * normalized attributes; every new mapping is also written to
     * 'fname-string-to-norm'.  That file is created regardless of output choice.
     *
     * @param fname csv fact table to process
     * @param d     number of dimensions (leading fields) to read from each row
     * @throws IllegalArgumentException if a non-empty row has fewer than d fields
     */
    public void createHashTables(String fname, int d)
    {
        // start from empty tables so a repeated call cannot mix old counts with new maps
        allocateTables(d);

        PrintWriter permanent_out = null;
        BufferedReader raw = null;
        try {
            permanent_out = new PrintWriter(new BufferedWriter(new FileWriter(fname + "-string-to-norm")));
            raw = new BufferedReader(new InputStreamReader(new FileInputStream(fname)));

            int[] nextNorm = new int[d];    // normalized value for the next new key in each dimension
            int lineNo = 0;
            String line;
            while ((line = raw.readLine()) != null) {
                ++lineNo;
                if (v) {
                    System.out.println("Line read: " + line);
                }
                String[] atts = splitFields(line, d, lineNo);
                if (atts == null) {
                    continue;
                }
                for (int k = 0; k < d; ++k) {
                    String key = atts[k];
                    if (hash) {
                        Integer seen = mappedCoords[k].get(key);
                        if (seen == null) {     // it wasn't mapped yet
                            if (v) System.out.println("New Key: " + key);
                            mappedCoords[k].put(key, 1);
                        } else {                // increment count for this attribute
                            mappedCoords[k].put(key, seen + 1);
                        }
                    }
                    if (norm) {
                        Integer slot = mappedNorms[k].get(key);
                        if (slot == null) {     // it wasn't mapped yet
                            if (v) System.out.println("New Key: " + key);
                            // add this key to mappedNorms and store the relationship permanently
                            mappedNorms[k].put(key, nextNorm[k]);
                            permanent_out.println("dim: " + k + "\t" + key + "\t" + nextNorm[k]);
                            data[k].add(1);     // first occurrence: count starts at one
                            nextNorm[k]++;      // next new key gets the next integer
                        } else {                // increment count in the appropriate vector slot
                            int index = slot;
                            data[k].set(index, data[k].get(index) + 1);
                        }
                    }
                }
            }
        } catch (IOException ie) {
            ie.printStackTrace();
        } finally {
            closeQuietly(raw);
            closeQuietly(permanent_out);
        }

        if (v && hash) {
            for (int i = 0; i < d; ++i) {
                System.out.println(mappedCoords[i]);
            }
        }
    }

    /**
     * writeHashTables
     * Writes the (attribute value, count) tables to 'fname_COUNT' and
     * 'fname_COUNT.full'; both files receive the same content.
     *
     * Layout: number of dimensions, then one "index size" line per dimension,
     * then one "key TAB count" line per attribute, dimension by dimension.
     * A Hashtable does not guarantee iteration order, so each key is written
     * together with its own value from the same entry.
     *
     * @param fname name of the input file; the suffixes are appended to it
     */
    public void writeHashTables(String fname)
    {
        PrintWriter out = null;
        PrintWriter full_out = null;
        try {
            out = new PrintWriter(new BufferedWriter(new FileWriter(fname + "_COUNT")));
            full_out = new PrintWriter(new BufferedWriter(new FileWriter(fname + "_COUNT.full")));
            writeCounts(out);
            writeCounts(full_out);
        } catch (IOException ie) {
            System.out.println("inside writeHashTables");
            ie.printStackTrace();
        } finally {
            closeQuietly(out);
            closeQuietly(full_out);
        }
    }

    /** Writes the metadata lines followed by every key/count pair of every dimension. */
    private void writeCounts(PrintWriter target)
    {
        // number of dimensions
        target.print(d + "\n");
        // dimension sizes
        for (int index = 0; index < d; ++index) {
            target.print(index + " " + mappedCoords[index].size() + "\n");
        }
        // attributes and their counts
        for (int index = 0; index < d; ++index) {
            for (Map.Entry<String, Integer> entry : mappedCoords[index].entrySet()) {
                target.print(entry.getKey() + "\t");
                target.print(entry.getValue());
                target.println();
            }
        }
    }

    /**
     * writeNormFile
     * Re-reads the fact table and writes it in normalized form twice: as packed
     * ints with no separators to writeFile, and as comma-terminated text rows to
     * '&lt;readFile without extension&gt;_normalized'.  Empty lines are skipped.
     *
     * @param readFile  name of file to read
     * @param writeFile name of the binary file to write
     * @throws IllegalArgumentException if a non-empty row has fewer than d fields
     * @throws IllegalStateException    if a field value has no normalized mapping
     */
    public void writeNormFile(String readFile, String writeFile)
    {
        DataOutputStream pw = null;
        PrintWriter writer = null;
        BufferedReader buf = null;
        try {
            System.out.println("writing normalized file.....");
            pw = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(new File(writeFile))));
            writer = new PrintWriter(new BufferedWriter(new FileWriter(normalizedTextName(readFile))));
            buf = new BufferedReader(new InputStreamReader(new FileInputStream(readFile)));

            int lineNo = 0;
            String line;
            while ((line = buf.readLine()) != null) {
                ++lineNo;
                String[] fields = splitFields(line, d, lineNo);
                if (fields == null) {
                    continue;
                }
                for (int i = 0; i < d; ++i) {
                    Integer code = mappedNorms[i].get(fields[i]);
                    if (code == null) {
                        throw new IllegalStateException("line " + lineNo + ": no normalized value for '"
                            + fields[i] + "' in dimension " + i);
                    }
                    pw.writeInt(code);          // integers with no commas or linebreaks
                    writer.print(code + ",");
                }
                writer.println();
            }
            pw.close();     // surface write errors instead of losing them in cleanup
        } catch (IOException ie) {
            ie.printStackTrace();
        } finally {
            closeQuietly(pw);
            closeQuietly(writer);
            closeQuietly(buf);
        }
    }

    /**
     * Builds the name of the text output of writeNormFile: the input name with
     * its extension removed plus "_normalized".  A name without an extension is
     * used whole; a dot inside a directory name is not treated as an extension.
     */
    private static String normalizedTextName(String readFile)
    {
        int dot = readFile.lastIndexOf('.');
        int separator = Math.max(readFile.lastIndexOf('/'), readFile.lastIndexOf(File.separatorChar));
        String stem = (dot > separator) ? readFile.substring(0, dot) : readFile;
        return stem + "_normalized";
    }
}