ejr.bib

@comment{{This file has been generated by bib2bib 1.99}}
@comment{{Command line: bib2bib -s $date ejr-refereed.bib ejr-bookish.bib ejr-proceedings.bib ejr-invited-pres.bib ejr-tutorials.bib ejr-tr.bib ejr-pres.bib ejr-unpub.bib ejr-tradepub.bib ejr-other.bib}}
@inproceedings{ia-cost,
  author         = {Joseph N. Wilson and E. Jason Riedy},
  ejr-withauthor = {Joseph N. Wilson},
  title          = {Efficient {SIMD} evaluation of image processing programs},
  booktitle      = {Parallel and Distributed Methods for Image Processing},
  editor         = {Hongchi Shi and Patrick C. Coffield},
  volume         = {3166},
  pages          = {199--210},
  organization   = {SPIE},
  address        = {San Diego, CA},
  year           = {1997},
  month          = {July},
  dom            = {28},
  doi            = {10.1117/12.279618},
  file           = {material/ia-cost.pdf},
  role           = {proceedings},
  projtag        = {image-algebra},
  keywords       = {image algebra, parallel algorithm},
  opttags        = {spie; image algebra; parallel algorithms},
  abstract       = {SIMD parallel systems have been employed for image processing and computer vision applications since their inception.  This paper describes a system in which parallel programs are implemented using a machine-independent, retargetable object library that provides SIMD execution on the Lockheed Martin PAL-I SIMD parallel processor. Programs' performance on this machine is improved through on-the-fly execution analysis and scheduling. We describe the relevant elements of the system structure, the general scheme for execution analysis, and the current cost model for scheduling.}
}
@incollection{ia-simd-chapter,
  file = {material/ia-simd-chap.pdf},
  title = {An {Image} {Algebra} Based {SIMD} Image Processing Environment},
  author = {Joseph N. Wilson and E. Jason Riedy and Gerhard X. Ritter and Hongchi Shi},
  booktitle = {Visual Information Representation, Communication, and Image Processing},
  publisher = {Marcel Dekker},
  year = {1999},
  address = {New York},
  editor = {C. W. Chen and Y. Q. Zhang},
  pages = {523--542},
  abstract = {SIMD parallel computers have been employed for image related applications since their inception. They have been leading the way in improving processing speed for those applications. However, current parallel programming technologies have not kept pace with the performance growth and cost decline of parallel hardware. A highly usable parallel software development environment is needed. This chapter presents a computing environment that integrates a SIMD mesh architecture with image algebra for high-performance image processing applications. The environment describes parallel programs through a machine-independent, retargetable image algebra object library that supports SIMD execution on the Lockheed Martin PAL-I parallel computer. Program performance on this machine is improved through on-the-fly execution analysis and scheduling. We describe the relevant elements of the system structure, outline the scheme for execution analysis, and provide examples of the current cost model and scheduling system.},
  optciteseer = {wilson97image.html},
  isbn = {082471928X},
  role = {chapter},
  keywords = {image algebra, parallel algorithm},
  projtag = {image-algebra}
}
@unpublished{tera-ubench,
  author = {E. Jason Riedy and Rich Vuduc},
  ejr-withauthor = {Rich Vuduc},
  file = {material/Tera.pdf},
  title = {Microbenchmarking the {Tera} {MTA}},
  note = {Cited},
  other-url = {http://purl.oclc.org/NET/jason-riedy/resume/material/Tera-presentation.pdf},
  dom = 21,
  month = {May},
  year = 1999,
  abstract = {The Tera Multithreaded Architecture, or MTA, addresses scalable shared memory system design with a different approach; it tolerates latency through providing fast access to multiple threads of execution. The MTA employs a number of radical design ideas: creation of hardware threads (streams) with frequent context switching; full-empty bits for each memory word; a flat memory hierarchy; and deep pipelines. Recent evaluations of the MTA have taken a top-down approach: port applications and application benchmarks, and compare the absolute performance with conventional systems. While useful, these studies do not reveal the effect of the Tera MTA's unique hardware features on an application. We present a bottom-up approach to the evaluation of the MTA via a suite of microbenchmarks to examine in detail the underlying hardware mechanisms and the cost of runtime system support for multithreading. In particular, we measure memory, network, and instruction latencies; memory bandwidth; the cost of low-level synchronization via full-empty bits; overhead for stream management; and the effects of software pipelining. These data should provide a foundation for performance modeling on the MTA. We also present results for list ranking on the MTA, an application which has traditionally been difficult to scale on conventional parallel systems.},
  role = {unpublished},
  opttags = {parallel programming; parallel algorithms; multithreaded; computer architecture; cray},
  projtag = {cassmt},
  keywords = {parallel algorithm, novel architecture, memory-centric},
  ejr-proj = {novel-arch}
}
@unpublished{power-control,
  author         = {E. Jason Riedy and Robert Szewczyk},
  ejr-withauthor = {Robert Szewczyk},
  title          = {Power and Control in Networked Sensors},
  note           = {Cited},
  year           = {2000},
  month          = {May},
  dom            = {11},
  file           = {material/power-and-control.pdf},
  citeseer       = {riedy00power.html},
  role           = {unpublished},
  keywords       = {embedded, sensor, {IoT}, novel architecture},
  opttags        = {sensor network},
  abstract       = {The fundamental constraint on a networked sensor is its energy consumption, since it may be either impossible or not feasible to replace its energy source.  We analyze the power dissipation implications of implementing the network sensor with either a central processor switching between I/O devices or a family of processors, each dedicated to a single device.  We present the energy measurements of the current generations of networked sensors, and develop an abstract description of tradeoffs between both designs.}
}
@unpublished{fp-type-project,
  author = {E. Jason Riedy},
  title = {Type System Support for Floating-Point Computation},
  month = {May},
  dom = 25,
  file = {material/type-support-for-fp.pdf},
  abstract = {Floating-point arithmetic is often seen as untrustworthy. We show how manipulating precisions according to the following rules of thumb enhances the reliability of and removes surprises from calculations: Store data narrowly, compute intermediates widely, and derive properties widely.  Further, we describe a typing system for floating point that both supports and is supported by these rules. A single type is established for all intermediate computations. The type describes a precision at least as wide as all inputs to and results from the computation. Picking a single type provides benefits to users, compilers, and interpreters. The type system also extends cleanly to encompass intervals and higher precisions.},
  year = 2001,
  role = {unpublished},
  opttags = {programming language; floating point; ieee754},
  projtag = {ieee754},
  keywords = {floating point, ieee754},
  ejr-proj = {floating-point}
}
@misc{bascd2002-poster,
  author       = {E. Jason Riedy},
  title        = {Parallel Bipartite Matching for Sparse Matrix Computation},
  howpublished = {Third Bay Area Scientific Computing Day},
  address      = {Livermore, CA},
  year         = {2002},
  month        = {March},
  role         = {poster},
  projtag      = {sparse-methods},
  ejr-proj     = {linear-algebra, graph-analysis},
  keywords     = {sparse matrix, parallel algorithm, graph analysis},
  opttags      = {bascd; sparse matrix; combinatorial optimization; parallel algorithms}
}
@misc{ieee754-exceptions,
  author         = {David Bindel and E. Jason Riedy},
  ejr-withauthor = {David Bindel},
  title          = {Exception Handling Interfaces, Implementations, and Evaluation},
  howpublished   = {IEEE-754r revision meeting},
  year           = {2002},
  month          = {August},
  url            = {http://grouper.ieee.org/groups/754/meeting-materials/2002-08-22-pres.pdf},
  file           = {material/ieee754-2002-08-22-pres.pdf},
  projtag        = {ieee754},
  ejr-proj       = {floating-point},
  keywords       = {floating point, ieee754},
  optrole        = {presentation},
  opttags        = {ieee754; floating point}
}
@misc{siam-cse03,
  author       = {E. Jason Riedy},
  title        = {Parallel Bipartite Matching for Sparse Matrix Computations},
  howpublished = {SIAM Conference on Computational Science and Engineering},
  year         = {2003},
  month        = {February},
  url          = {http://purl.oclc.org/NET/jason-riedy/resume/material/siam-cse03-poster.pdf},
  file         = {material/siam-cse03-poster.pdf},
  role         = {poster},
  projtag      = {sparse-methods},
  ejr-proj     = {linear-algebra, graph-analysis},
  keywords     = {sparse matrix, parallel algorithm, linear algebra, graph analysis},
  opttags      = {siam; parallel algorithms; combinatorial optimization; sparse matrix},
  abstract     = {Practical and efficient methods exist for parallelizing the numerical work in sparse matrix calculations. The initial symbolic analysis is now becoming a sequential bottleneck, limiting problems' sizes. One such analysis is the weighted bipartite matching used to achieve scalable, unsymmetric $LU$ factorization in Super\textsc{lu}. Applying a mathematical optimization algorithm produces a distributed-memory implementation with explicit trade-offs between speed and matching quality. We present accuracy and performance results for this phase alone and in the context of Super\textsc{lu}.}
}
@misc{siam-am03,
  author       = {E. Jason Riedy},
  title        = {Practical Alternatives for Parallel Pivoting},
  howpublished = {SIAM Annual Meeting},
  year         = {2003},
  month        = {June},
  url          = {http://purl.oclc.org/NET/jason-riedy/resume/material/siam-am03.pdf},
  file         = {material/siam-am03.pdf},
  projtag      = {sparse-methods},
  ejr-proj     = {linear-algebra, graph-analysis},
  keywords     = {sparse matrix, linear algebra, parallel algorithm, graph analysis},
  optrole      = {presentation},
  opttags      = {siam; sparse matrix; linear algebra},
  abstract     = {Traditional pivoting during parallel, unsymmetric $LU$ factorization introduces heavy communication and restructuring costs. Possible alternatives include pre-pivoting to place heavy elements along the diagonal and limited pivoting that maintains the factors' structures. Each alternative comes with trade-offs that affect accuracy and performance.}
}
@misc{sparse-ds-csc04,
  author       = {E. Jason Riedy},
  title        = {Sparse Data Structures for Weighted Bipartite Matching},
  howpublished = {SIAM Workshop on Combinatorial Scientific Computing},
  year         = {2004},
  month        = {February},
  dom          = {28},
  url          = {http://purl.oclc.org/NET/jason-riedy/resume/material/csc04.pdf},
  file         = {material/csc04.pdf},
  projtag      = {sparse-methods},
  ejr-proj     = {linear-algebra, graph-analysis},
  keywords     = {sparse matrix, graph analysis},
  optrole      = {presentation},
  opttags      = {siam; combinatorial optimization; sparse matrix}
}
@misc{par-bipart-pp04,
  author = {E. Jason Riedy},
  title = {Parallel Weighted Bipartite Matching and Applications},
  howpublished = {SIAM Parallel Processing for Scientific Computing},
  dom = 27,
  month = {February},
  year = 2004,
  optrole = {minisymposium speaker; presentation},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/pp04.pdf},
  file = {material/pp04.pdf},
  opttags = {siam; combinatorial optimization; parallel algorithms; sparse matrix},
  abstract = {Bipartite matching is one of graph theory's workhorses, occurring in the solution or approximation of many problems. Increasingly, applications' data spans multiple memory spaces, but there is little recent experience with distributed matching algorithms. We present a distributed, parallel implementation for weighted bipartite matching based on Bertsekas's auction algorithm. The bidding process finds local matchings while summarizing updates for occasional communication, leading to superlinear speed-ups on some sparse problems and modest performance on others.},
  projtag = {sparse-methods},
  keywords = {sparse matrix, parallel algorithm, graph analysis},
  ejr-proj = {linear-algebra, graph-analysis}
}
@techreport{axb-itref-lawn,
  author         = {James W. Demmel and Yozo Hida and W. Kahan and Xiaoye S. Li and Sonil Mukherjee and E. Jason Riedy},
  ejr-withauthor = {James W. Demmel and Yozo Hida and W. Kahan and Xiaoye S. Li and Sonil Mukherjee},
  title          = {Error bounds from extra-precise iterative refinement},
  type           = {LAPACK Working Note},
  number         = {165},
  lawn           = {165},
  institution    = {Netlib},
  year           = {2005},
  month          = {February},
  dom            = {3},
  note           = {Also issued as UCB//CSD-05-1414, UT-CS-05-547, and LBNL-56965; expanded from TOMS version},
  url            = {http://www.netlib.org/lapack/lawnspdf/lawn165.pdf},
  other-url      = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-77.html},
  file           = {material/lawn165.pdf},
  role           = {techreport},
  projtag        = {lapack, ieee754},
  opttags        = {lawn; lapack; linear algebra; floating point},
  abstract       = {We present the design and testing of an algorithm for iterative refinement of the solution of linear equations, where the residual is computed with extra precision. This algorithm was originally proposed in the 1960s [6, 22] as a means to compute very accurate solutions to all but the most ill-conditioned linear systems of equations. However two obstacles have until now prevented its adoption in standard subroutine libraries like LAPACK: (1) There was no standard way to access the higher precision arithmetic needed to compute residuals, and (2) it was unclear how to compute a reliable error bound for the computed solution. The completion of the new BLAS Technical Forum Standard [5] has recently removed the first obstacle. To overcome the second obstacle, we show how a single application of iterative refinement can be used to compute an error bound in any norm at small cost, and use this to compute both an error bound in the usual infinity norm, and a componentwise relative error bound.  We report extensive test results on over 6.2 million matrices of dimension 5, 10, 100, and 1000.  As long as a normwise (resp. componentwise) condition number computed by the algorithm is less than $1 / \operatorname{max}\{10, \sqrt{n}\}\varepsilon_w$ , the computed normwise (resp. componentwise) error bound is at most $2 \operatorname{max}\{10, \sqrt{n}\} \cdot \varepsilon_w$ , and indeed bounds the true error. Here, $n$ is the matrix dimension and $\varepsilon_w$ is single precision roundoff error. For worse conditioned problems, we get similarly small correct error bounds in over 89.4\% of cases.}
}
@misc{comb-sparse-cse05,
  author = {E. Jason Riedy},
  title = {Parallel Combinatorial Computing and Sparse Matrices},
  howpublished = {SIAM Conference on Computational Science and Engineering},
  dom = 14,
  month = {February},
  year = 2005,
  optrole = {minisymposium speaker; presentation},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/cse05.pdf},
  file = {material/cse05.pdf},
  opttags = {combinatorial optimization; sparse matrix; parallel algorithms; siam},
  abstract = {Increasingly, sparse matrix applications produce matrices too large for a single computer's memory. Distributed, parallel computers provide an avenue around memory limitations, but distributing combinatorial algorithms is historically difficult. We use insights from combinatorial optimization to design loosely coupled algorithms for sparse matrix matching, ordering, and symbolic factorization. These algorithms' performance depends on both problem instance and computer architecture. We investigate these aspects of performance and demonstrate issues that affect distributed combinatorial computing.},
  projtag = {sparse-methods},
  keywords = {sparse matrix, parallel algorithm, graph analysis},
  ejr-proj = {linear-algebra, graph-analysis}
}
@inproceedings{arith-lang,
  author = {David Hough and Bill Hay and Jeff Kidder and E. Jason Riedy and Steele, Jr., Guy L. and Jim Thomas},
  ejr-withauthor = {David Hough and Bill Hay and Jeff Kidder and Steele, Jr., Guy L. and Jim Thomas},
  title = {Arithmetic Interactions: From Hardware to Applications},
  booktitle = {17th {IEEE} Symposium on Computer Arithmetic ({ARITH}'05)},
  year = 2005,
  dom = 28,
  month = {June},
  note = {See \href{http://purl.oclc.org/NET/jason-riedy/resume/material/arith17-slides.pdf}{related presentation}},
  isbn = {0-7695-2366-8},
  role = {proceedings; panel},
  opttags = {ieee754; floating point},
  doi = {10.1109/ARITH.2005.10},
  abstract = {The entire process of creating and executing applications that solve interesting problems with acceptable cost and accuracy involves a complex interaction among hardware, system software, programming environments, mathematical software libraries, and applications software, all mediated by standards for arithmetic, operating systems, and programming environments. This panel will discuss various issues arising among these various contending points of view, sometimes from the point of view of issues raised during the current IEEE 754R standards revision effort.},
  projtag = {ieee754},
  keywords = {ieee754, floating point},
  ejr-proj = {floating-point}
}
@misc{lang-tools-ieee754,
  author = {E. Jason Riedy},
  title = {Modern Language Tools and {754R}},
  howpublished = {{ARITH}'05},
  month = {June},
  dom = 28,
  year = 2005,
  role = {panel participant; presentation},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/arith17-slides.pdf},
  file = {material/arith17-slides.pdf},
  opttags = {ieee754; floating point},
  note = {Invited presentation and panelist},
  projtag = {lapack, ieee754},
  keywords = {linear algebra, sparse matrix, floating point, lapack},
  ejr-proj = {linear-algebra}
}
@techreport{tridiag-lawn,
  author = {Osni A. Marques and E. Jason Riedy and Christof V{\"o}mel},
  ejr-withauthor = {Osni A. Marques and Christof V{\"o}mel},
  title = {Benefits of {IEEE-754} Features in Modern Symmetric Tridiagonal Eigensolvers},
  type = {LAPACK Working Note},
  number = 172,
  lawn = 172,
  institution = {Netlib},
  month = {September},
  year = 2005,
  dom = 30,
  note = {Also issued as UCB//CSD-05-1414; expanded from SISC version},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn172.pdf},
  other-url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2005/6514.html},
  role = {techreport},
  opttags = {lawn; lapack; floating point; ieee754; eigenvalue},
  file = {material/lawn172.pdf},
  abstract = {Bisection is one of the most common methods used to compute the eigenvalues of symmetric tridiagonal matrices. Bisection relies on the Sturm count: For a given shift sigma, the number of negative pivots in the factorization $T - \sigma I = LDL^T$ equals the number of eigenvalues of T that are smaller than sigma. In IEEE-754 arithmetic, the value $\infty$ permits the computation to continue past a zero pivot, producing a correct Sturm count when $T$ is unreduced. Demmel and Li showed [IEEE Trans. Comput., 43 (1994), pp. 983--992] that using $\infty$ rather than testing for zero pivots within the loop could significantly improve performance on certain architectures. When eigenvalues are to be computed to high relative accuracy, it is often preferable to work with $LDL^T$ factorizations instead of the original tridiagonal $T$. One important example is the MRRR algorithm. When bisection is applied to the factored matrix, the Sturm count is computed from $LDL^T$ which makes differential stationary and progressive qds algorithms the methods of choice. While it seems trivial to replace $T$ by $LDL^T$, in reality these algorithms are more complicated: In IEEE-754 arithmetic, a zero pivot produces an overflow followed by an invalid exception (NaN, or ``Not a Number'') that renders the Sturm count incorrect. We present alternative, safe formulations that are guaranteed to produce the correct result. Benchmarking these algorithms on a variety of platforms shows that the original formulation without tests is always faster provided that no exception occurs. The transforms see speed-ups of up to 2.6x over the careful formulations. Tests on industrial matrices show that encountering exceptions in practice is rare. This leads to the following design: First, compute the Sturm count by the fast but unsafe algorithm. Then, if an exception occurs, recompute the count by a safe, slower alternative. 
The new Sturm count algorithms improve the speed of bisection by up to 2x on our test matrices. Furthermore, unlike the traditional tiny-pivot substitution, proper use of IEEE-754 features provides a careful formulation that imposes no input range restrictions.},
  projtag = {lapack, ieee754}
}
@misc{lapack-future,
  author = {E. Jason Riedy and Yozo Hida and James W. Demmel},
  ejr-withauthor = {Yozo Hida and James W. Demmel},
  title = {The Future of {LAPACK} and {ScaLAPACK}},
  howpublished = {Robert C. Thompson Matrix Meeting},
  dom = 18,
  month = {November},
  year = 2005,
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/future-of-scalapack.pdf},
  file = {material/future-of-scalapack.pdf},
  optrole = {presentation},
  opttags = {lapack; software engineering},
  abstract = {We are planning new releases of the widely used LAPACK and ScaLAPACK numerical linear algebra libraries. Based on an on-going user survey (http://www.netlib.org/lapack-dev) and research by many people, we are proposing the following improvements: Faster algorithms (including better numerical methods, memory hierarchy optimizations, parallelism, and automatic performance tuning to accommodate new architectures), more accurate algorithms (including better numerical methods, and use of extra precision), expanded functionality (including updating and downdating, new eigenproblems, etc. and putting more of LAPACK into ScaLAPACK), and improved ease of use (friendlier interfaces in multiple languages). To accomplish these goals we are also relying on better software engineering techniques and contributions from collaborators at many institutions. This is joint work with Jack Dongarra.},
  projtag = {lapack, ieee754},
  keywords = {lapack, linear algebra, floating point},
  ejr-proj = {linear-algebra, floating-point}
}
@misc{bascd2006-poster,
  author       = {E. Jason Riedy},
  title        = {Making Static Pivoting Dependable},
  howpublished = {Seventh Bay Area Scientific Computing Day},
  address      = {Livermore, CA},
  year         = {2006},
  month        = {March},
  url          = {http://purl.oclc.org/NET/jason-riedy/resume/material/bascd2006-poster.pdf},
  file         = {material/bascd2006-poster.pdf},
  role         = {poster},
  projtag      = {lapack, sparse-methods},
  ejr-proj     = {linear-algebra, floating-point, graph-analysis},
  keywords     = {sparse matrix, linear algebra, floating point, graph analysis},
  optrole      = {poster},
  opttags      = {bascd; sparse matrix; linear algebra},
  abstract     = {For sparse LU factorization, dynamic pivoting tightly couples symbolic and numerical computation. Dynamic structural changes limit parallel scalability.  Demmel and Li use static pivoting in distributed SuperLU for performance, but intentionally perturbing the input may lead silently to erroneous results.  Are there experimentally stable static pivoting heuristics that lead to a dependable direct solver? The answer is currently a qualified yes. Current heuristics fail on a few systems, but all failures are detectable.}
}
@article{axb-itref-toms,
  author         = {James W. Demmel and Yozo Hida and W. Kahan and Xiaoye S. Li and Sonil Mukherjee and E. Jason Riedy},
  ejr-withauthor = {James W. Demmel and Yozo Hida and W. Kahan and Xiaoye S. Li and Sonil Mukherjee},
  title          = {Error bounds from extra-precise iterative refinement},
  journal        = {{ACM} Transactions on Mathematical Software},
  volume         = {32},
  number         = {2},
  pages          = {325--351},
  year           = {2006},
  month          = {June},
  doi            = {10.1145/1141885.1141894},
  issn           = {0098-3500},
  mrclass        = {65F10},
  mrnumber       = {2272365},
  file           = {material/gesvxx.pdf},
  role           = {refereed},
  gt-role        = {Assisted in software development, experimental methods, writing, and editing.},
  projtag        = {lapack, ieee754},
  ejr-proj       = {linear-algebra, floating-point},
  keywords       = {lapack, ieee754, floating point, linear algebra},
  opttags        = {acm; toms; lapack; floating point; linear algebra},
  abstract       = {We present the design and testing of an algorithm for iterative refinement of the solution of linear equations where the residual is computed with extra precision. This algorithm was originally proposed in 1948 and analyzed in the 1960s as a means to compute very accurate solutions to all but the most ill-conditioned linear systems. However, two obstacles have until now prevented its adoption in standard subroutine libraries like LAPACK: (1) There was no standard way to access the higher precision arithmetic needed to compute residuals, and (2) it was unclear how to compute a reliable error bound for the computed solution. The completion of the new BLAS Technical Forum Standard has essentially removed the first obstacle. To overcome the second obstacle, we show how the application of iterative refinement can be used to compute an error bound in any norm at small cost and use this to compute both an error bound in the usual infinity norm, and a componentwise relative error bound.}
}
@inproceedings{lapack-prospectus,
  author         = {James W. Demmel and Jack Dongarra and Beresford Parlett and W. Kahan and Ming Gu and David Bindel and Yozo Hida and Xiaoye S. Li and Osni A. Marques and E. Jason Riedy and Christof V{\"o}mel and Julien Langou and Piotr Luszczek and Jakub Kurzak and Alfredo Buttari and Julie Langou and Stanimire Tomov},
  ejr-withauthor = {James W. Demmel and Jack Dongarra and Beresford Parlett and W. Kahan and Ming Gu and David Bindel and Yozo Hida and Xiaoye S. Li and Osni A. Marques and Christof V{\"o}mel and Julien Langou and Piotr Luszczek and Jakub Kurzak and Alfredo Buttari and Julie Langou and Stanimire Tomov},
  title          = {Prospectus for the Next {LAPACK} and {ScaLAPACK} Libraries},
  booktitle      = {{PARA'06}: State-of-the-Art in Scientific and Parallel Computing},
  organization   = {High Performance Computing Center North ({HPC2N}) and the Department of Computing Science, Ume{\aa} University},
  publisher      = {Springer},
  address        = {Ume{\aa}, Sweden},
  year           = {2006},
  month          = {June},
  doi            = {10.1007/978-3-540-75755-9\_2},
  url            = {http://www.netlib.org/utk/people/JackDongarra/PAPERS/para06-lapack.pdf},
  file           = {material/lapack-prospectus.pdf},
  role           = {proceedings},
  projtag        = {lapack},
  ejr-proj       = {linear-algebra, floating-point},
  keywords       = {lapack, linear algebra, floating point},
  opttags        = {lapack},
  abstract       = {LAPACK and ScaLAPACK are widely used software libraries for numerical linear algebra. There have been over 68M web hits at www.netlib.org for the associated libraries LAPACK, ScaLAPACK, CLAPACK and LAPACK95. LAPACK and ScaLAPACK are used to solve leading edge science problems and they have been adopted by many vendors and software providers as the basis for their own libraries, including AMD, Apple (under Mac OS X), Cray, Fujitsu, HP, IBM, Intel, NEC, SGI, several Linux distributions (such as Debian), NAG, IMSL, the MathWorks (producers of MATLAB), Interactive Supercomputing, and PGI.  Future improvements in these libraries will therefore have a large impact on users.}
}
@unpublished{lapack-style,
  author         = {Jack Dongarra and Julien Langou and E. Jason Riedy},
  ejr-withauthor = {Jack Dongarra and Julien Langou},
  title          = {Sca/{LAPACK} Program Style},
  year           = {2006},
  month          = {August},
  url            = {http://www.netlib.org/lapack-dev/lapack-coding/program-style.html},
  role           = {unpublished},
  projtag        = {lapack},
  ejr-proj       = {linear-algebra},
  keywords       = {linear algebra, lapack, blas},
  opttags        = {lapack},
  abstract       = {The purpose of this document is to facilitate contributions to LAPACK and ScaLAPACK by documenting their design and implementation guidelines. The long-term goal is to provide guidelines for both LAPACK and ScaLAPACK. However, the parallel ScaLAPACK code has more open issues, so this document primarily concerns LAPACK.}
}
@article{tridiag-sisc,
  author = {Osni A. Marques and E. Jason Riedy and Christof V{\"o}mel},
  ejr-withauthor = {Osni A. Marques and Christof V{\"o}mel},
  title = {Benefits of {IEEE-754} Features in Modern Symmetric Tridiagonal Eigensolvers},
  journal = {SIAM Journal on Scientific Computing},
  year = 2006,
  month = {September},
  dom = 28,
  volume = 28,
  number = 5,
  pages = {1613--1633},
  role = {refereed},
  opttags = {siam; sisc; floating point; eigenvalue; ieee754},
  doi = {10.1137/050641624},
  issn = {1064-8275},
  mrclass = {65F15},
  mrnumber = 2272181,
  abstract = {Bisection is one of the most common methods used to compute the eigenvalues of symmetric tridiagonal matrices. Bisection relies on the Sturm count: For a given shift sigma, the number of negative pivots in the factorization $T - \sigma I = LDL^T$ equals the number of eigenvalues of T that are smaller than sigma. In IEEE-754 arithmetic, the value $\infty$ permits the computation to continue past a zero pivot, producing a correct Sturm count when $T$ is unreduced. Demmel and Li showed [IEEE Trans. Comput., 43 (1994), pp. 983--992] that using $\infty$ rather than testing for zero pivots within the loop could significantly improve performance on certain architectures. When eigenvalues are to be computed to high relative accuracy, it is often preferable to work with $LDL^T$ factorizations instead of the original tridiagonal $T$. One important example is the MRRR algorithm. When bisection is applied to the factored matrix, the Sturm count is computed from $LDL^T$ which makes differential stationary and progressive qds algorithms the methods of choice. While it seems trivial to replace $T$ by $LDL^T$, in reality these algorithms are more complicated: In IEEE-754 arithmetic, a zero pivot produces an overflow followed by an invalid exception (NaN, or ``Not a Number'') that renders the Sturm count incorrect. We present alternative, safe formulations that are guaranteed to produce the correct result. Benchmarking these algorithms on a variety of platforms shows that the original formulation without tests is always faster provided that no exception occurs. The transforms see speed-ups of up to 2.6x over the careful formulations. Tests on industrial matrices show that encountering exceptions in practice is rare. This leads to the following design: First, compute the Sturm count by the fast but unsafe algorithm. Then, if an exception occurs, recompute the count by a safe, slower alternative. 
The new Sturm count algorithms improve the speed of bisection by up to 2x on our test matrices. Furthermore, unlike the traditional tiny-pivot substitution, proper use of IEEE-754 features provides a careful formulation that imposes no input range restrictions.},
  gt-role = {Principal author and developer.},
  file = {material/ieee754-tridiag.pdf},
  projtag = {lapack},
  keywords = {lapack, ieee754, floating point, linear algebra},
  ejr-proj = {linear-algebra, floating-point}
}
@techreport{lapack-prospectus-lawn,
  title = {Prospectus for the Next {LAPACK} and {ScaLAPACK} Libraries},
  author = {James W. Demmel and Jack Dongarra and Beresford Parlett and W. Kahan and Ming Gu and David Bindel and Yozo Hida and Xiaoye S. Li and Osni A. Marques and E. Jason Riedy and Christof V{\"o}mel and Julien Langou and Piotr Luszczek and Jakub Kurzak and Alfredo Buttari and Julie Langou and Stanimire Tomov},
  ejr-withauthor = {James W. Demmel and Jack Dongarra and Beresford Parlett and W. Kahan and Ming Gu and David Bindel and Yozo Hida and Xiaoye S. Li and Osni A. Marques and Christof V{\"o}mel and Julien Langou and Piotr Luszczek and Jakub Kurzak and Alfredo Buttari and Julie Langou and Stanimire Tomov},
  type = {LAPACK Working Note},
  number = 181,
  lawn = 181,
  institution = {Netlib},
  month = {February},
  year = 2007,
  note = {Also issued as UT-CS-07-592},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn181.pdf},
  file = {material/lawn181.pdf},
  role = {techreport},
  opttags = {lawn; lapack},
  projtag = {lapack}
}
@misc{bascd2007-poster,
  title = {Precise Solutions for Overdetermined Least Squares Problems},
  author = {James W. Demmel and Yozo Hida and Xiaoye S. Li and E. Jason Riedy and Meghana Vishvanath and David Vu},
  ejr-withauthor = {James W. Demmel and Yozo Hida and Xiaoye S. Li and Meghana Vishvanath and David Vu},
  howpublished = {Stanford 50 -- Eighth Bay Area Scientific Computing Day},
  address = {Stanford, CA},
  year = 2007,
  month = {March},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/bascd2007-poster.pdf},
  abstract = {Linear least squares (LLS) fitting is the most widely used data modeling technique and is included in almost every data analysis system (e.g. spreadsheets). These software systems often give no feedback on the conditioning of the LLS problem or the floating-point calculation errors present in the solution.  With limited use of extra precision, we can eliminate these concerns for all but the most ill-conditioned LLS problems. Our algorithm provides either a solution and residual with relatively tiny error or a notice that the LLS problem is too ill-conditioned.},
  role = {poster},
  optrole = {poster},
  opttags = {bascd; least squares},
  projtag = {lapack, ieee754},
  keywords = {least squares, lapack, blas, linear algebra, floating point},
  ejr-proj = {linear-algebra, floating-point}
}
@techreport{lawn188,
  author = {James W. Demmel and Yozo Hida and Xiaoye S. Li and E. Jason Riedy},
  ejr-withauthor = {James W. Demmel and Yozo Hida and Xiaoye S. Li},
  title = {Extra-precise iterative refinement for overdetermined least squares problems},
  type = {LAPACK Working Note},
  institution = {Netlib},
  year = 2007,
  number = 188,
  lawn = 188,
  month = {May},
  dom = 31,
  url = {http://www.netlib.org/lapack/lawnspdf/lawn188.pdf},
  file = {material/lawn188.pdf},
  other-url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-77.html},
  note = {Also issued as UCB/EECS-2007-77; version accepted for TOMS.},
  abstract = {We present the algorithm, error bounds, and numerical results for extra-precise iterative refinement applied to overdetermined linear least squares (LLS) problems. We apply our linear system refinement algorithm to Bj{\"o}rck's augmented linear system formulation of an LLS problem. Our algorithm reduces the forward normwise and componentwise errors to $O(\varepsilon)$ unless the system is too ill conditioned. In contrast to linear systems, we provide two separate error bounds for the solution $x$ and the residual $r$. The refinement algorithm requires only limited use of extra precision and adds only $O(mn)$ work to the $O(mn^2)$ cost of QR factorization for problems of size m-by-n. The extra precision calculation is facilitated by the new extended-precision BLAS standard in a portable way, and the refinement algorithm will be included in a future release of LAPACK and can be extended to the other types of least squares problems.},
  role = {techreport},
  opttags = {lawn; lapack; least squares; floating point},
  projtag = {lapack, ieee754}
}
@techreport{nonneg-house-lawn,
  title = {Non-Negative Diagonals and High Performance on Low-Profile Matrices from {Householder} {$QR$}},
  author = {James W. Demmel and Mark Frederick Hoemmen and Yozo Hida and E. Jason Riedy},
  ejr-withauthor = {James W. Demmel and Mark Frederick Hoemmen and Yozo Hida},
  type = {LAPACK Working Note},
  number = 203,
  lawn = 203,
  institution = {Netlib},
  dom = 30,
  month = {May},
  year = 2008,
  note = {Also issued as UCB/EECS-2008-76; modified from SISC version.},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn203.pdf},
  other-url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2008/EECS-2008-76.html},
  file = {material/lawn203.pdf},
  abstract = {The Householder reflections used in LAPACK's $QR$ factorization leave positive and negative real entries along $R$'s diagonal. This is sufficient for most applications of $QR$ factorizations, but a few require that $R$ have a nonnegative diagonal. This note describes a new Householder generation routine to produce a nonnegative diagonal. Additionally, we find that scanning for trailing zeros in the generated reflections leads to large performance improvements when applying reflections with many trailing zeros. Factoring low-profile matrices, those with nonzero entries mostly near the diagonal (e.g., band matrices), now require far fewer operations. For example, $QR$ factorization of matrices with profile width b that are stored densely in an $n\times n$ matrix improves from $O(n^3)$ to $O(n^2+nb^2)$. These routines are in LAPACK 3.2.},
  role = {techreport},
  opttags = {lawn; lapack; householder; qr},
  projtag = {lapack}
}
@techreport{ieee754-2008,
  note = {(committee member and contributor)},
  author = {{IEEE 754 Committee}},
  key = {IEEE Std 754-2008},
  journal = {IEEE Std 754-2008},
  type = {IEEE Std},
  number = {754-2008},
  title = {{IEEE} Standard for Floating-Point Arithmetic},
  year = 2008,
  pages = {1--70},
  institution = {Microprocessor Standards Committee of the IEEE Computer Society},
  abstract = {This standard specifies interchange and arithmetic formats and methods for binary and decimal floating-point arithmetic in computer programming environments. This standard specifies exception conditions and their default handling. An implementation of a floating-point system conforming to this standard may be realized entirely in software, entirely in hardware, or in any combination of software and hardware. For operations specified in the normative part of this standard, numerical results and exceptions are uniquely determined by the values of the input data, sequence of operations, and destination formats, all under user control.},
  keywords = {IEEE standards;floating point arithmetic;programming;IEEE standard;arithmetic formats;computer programming;decimal floating-point arithmetic;754-2008;NaN;arithmetic;binary;computer;decimal;exponent;floating-point;format;interchange;number;rounding;significand;subnormal},
  doi = {10.1109/IEEESTD.2008.4610935},
  isbn = {978-0-7381-5753-5},
  address = {New York, NY},
  month = {August},
  dom = 29,
  committee = {Alex Aiken and Matthew Applegate and David Bailey and Steve Bass and Dileep Bhandarkar and Mahesh Bhat and David Bindel and Sylvie Boldo and Stephen Canon and Steven R. Carlough and Marius Cornea and Mike Cowlishaw (editor) and John H. Crawford and Joseph D. Darcy and Marc Daumas and Bob Davis and Mark Davis and Dick Delp and Jim Demmel and Mark A. Erle and Hossam A. H. Fahmy and J.P. Fasano and Richard Fateman and Eric Feng and Warren E. Ferguson and Alex Fit-Florea and Laurent Fournier and Chip Freitag and Ivan Godard and Roger A. Golliver and David Gustafson and Michel Hack and John R. Harrison and John Hauser and Yozo Hida and Chris N. Hinds and Graydon Hoare and David G. Hough and Jerry Huck and Jim Hull and Michael Ingrassia and David V. James and Rick James and William Kahan and John Kapernick and Richard Karpinski and Jeff Kidder and Plamen Koev and Ren-Cang Li and Zhishun Alex Liu and Raymond Mak and Peter Markstein and David Matula and Guillaume Melquiond and Nobuyoshi Mori and Ricardo Morin and Ned Nedialkov and Craig Nelson and Stuart Oberman and Jon Okada and Ian Ollmann and Michael Parks and Tom Pittman and Eric Postpischil and Jason Riedy and Debjit Das Sarma and Eric M. Schwarz and David Scott and Don Senzig and Ilya Sharapov and Jim Shearer and Michael Siu and Ron Smith and Chuck Stevens and Peter Tang and Pamela J. Taylor and James W. Thomas and Brandon Thompson and Wendy Thrash and Neil Toda and Son Dao Trong and Leonard Tsai and Charles Tsen and Fred Tydeman and Liang Kai Wang and Scott Westbrook and Steve Winkler and Anthony Wood and Umit Yalcinalp and Fred Zemke and Paul Zimmermann and Dan Zuras (chair)},
  projtag = {ieee754}
}
@misc{cerfacs08,
  file = {material/cerfacs08.pdf},
  author = {E. Jason Riedy},
  title = {Auctions for Distributed (and Possibly Parallel) Matchings},
  howpublished = {Visit to \href{http://www.cerfacs.fr/}{CERFACS} courtesy of the Franco-Berkeley Fund},
  dom = 17,
  month = {December},
  year = 2008,
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/cerfacs08.pdf},
  opttags = {cerfacs; combinatorial optimization; sparse matrix},
  note = {Invited presentation},
  projtag = {sparse-methods},
  keywords = {linear algebra, sparse matrix, floating point, lapack},
  ejr-proj = {linear-algebra}
}
@article{lsq-itref-toms,
  author = {James W. Demmel and Yozo Hida and Xiaoye S. Li and E. Jason Riedy},
  ejr-withauthor = {James W. Demmel and Yozo Hida and Xiaoye S. Li},
  title = {Extra-precise iterative refinement for overdetermined least squares problems},
  journal = {{ACM} Transactions on Mathematical Software},
  volume = 35,
  number = 4,
  year = 2009,
  month = {February},
  issn = {0098-3500},
  pages = {1--32},
  doi = {10.1145/1462173.1462177},
  accepted = {25 June 2008},
  role = {refereed},
  opttags = {acm; toms; lapack; floating point; linear algebra},
  abstract = {We present the algorithm, error bounds, and numerical results for extra-precise iterative refinement applied to overdetermined linear least squares (LLS) problems. We apply our linear system refinement algorithm to Bj{\"o}rck's augmented linear system formulation of an LLS problem. Our algorithm reduces the forward normwise and componentwise errors to $O(\varepsilon)$ unless the system is too ill conditioned. In contrast to linear systems, we provide two separate error bounds for the solution $x$ and the residual $r$. The refinement algorithm requires only limited use of extra precision and adds only $O(mn)$ work to the $O(mn^2)$ cost of QR factorization for problems of size m-by-n. The extra precision calculation is facilitated by the new extended-precision BLAS standard in a portable way, and the refinement algorithm will be included in a future release of LAPACK and can be extended to the other types of least squares problems.},
  gt-role = {Assisted in software development, experimental methods, writing, and editing.},
  file = {material/lsq_itrefx.pdf},
  projtag = {lapack, ieee754},
  keywords = {lapack, ieee754, floating point, linear algebra},
  ejr-proj = {linear-algebra, floating-point}
}
@article{nonneg-house-lawn-sisc,
  author = {James W. Demmel and Mark Frederick Hoemmen and Yozo Hida and E. Jason Riedy},
  ejr-withauthor = {James W. Demmel and Mark Frederick Hoemmen and Yozo Hida},
  title = {Non-Negative Diagonals and High Performance on Low-Profile Matrices from {Householder} {$QR$}},
  publisher = {SIAM},
  year = 2009,
  month = {July},
  dom = 3,
  journal = {SIAM Journal on Scientific Computing},
  volume = 31,
  number = 4,
  pages = {2832--2841},
  doi = {10.1137/080725763},
  role = {refereed},
  opttags = {siam; sisc; lapack; householder; qr},
  issn = {1064-8275},
  mrclass = {65F30},
  mrnumber = 2520301,
  gt-role = {Principal author and developer.},
  abstract = {The Householder reflections used in LAPACK's $QR$ factorization leave positive and negative real entries along $R$'s diagonal. This is sufficient for most applications of $QR$ factorizations, but a few require that $R$ have a nonnegative diagonal. This note describes a new Householder generation routine to produce a nonnegative diagonal. Additionally, we find that scanning for trailing zeros in the generated reflections leads to large performance improvements when applying reflections with many trailing zeros. Factoring low-profile matrices, those with nonzero entries mostly near the diagonal (e.g., band matrices), now require far fewer operations. For example, $QR$ factorization of matrices with profile width b that are stored densely in an $n\times n$ matrix improves from $O(n^3)$ to $O(n^2+nb^2)$. These routines are in LAPACK 3.2.},
  file = {material/lawn203.pdf},
  projtag = {lapack},
  keywords = {lapack, linear algebra, floating point, QR factorization, Householder reflection},
  ejr-proj = {linear-algebra}
}
@misc{gt09,
  file = {material/gt-2009-08-21.pdf},
  author = {E. Jason Riedy},
  title = {Dependable direct solutions for linear systems using a little extra precision},
  howpublished = {CSE Seminar at Georgia Institute of Technology},
  dom = 21,
  month = {August},
  year = 2009,
  url = {http://hdl.handle.net/1853/29795},
  opttags = {linear algebra; floating point; lapack},
  note = {Invited presentation},
  abstract = {Solving a square linear system $Ax=b$ often is considered a black box. It's supposed to ``just work,'' and failures often are blamed on the original data or subtleties of floating-point. Now that we have an abundance of cheap computations, however, we can do much better. A little extra precision in just the right places produces accurate solutions cheaply or demonstrates when problems are too hard to solve without significant cost. This talk will outline the method, iterative refinement with a new twist; the benefits, small backward and forward errors; and the trade-offs and unexpected benefits.},
  projtag = {lapack, sparse-methods, ieee754},
  keywords = {linear algebra, sparse matrix, floating point, lapack},
  ejr-proj = {linear-algebra}
}
@inproceedings{mtaap10,
  title = {Massive Streaming Data Analytics: A Case Study with Clustering Coefficients},
  author = {David Ediger and Karl Jiang and E. Jason Riedy and David A. Bader},
  ejr-withauthor = {David Ediger and Karl Jiang and David A. Bader},
  booktitle = {4th Workshop on Multithreaded Architectures and Applications (MTAAP)},
  address = {Atlanta, GA},
  dom = 23,
  month = {April},
  year = 2010,
  acc-note = {(11/22 papers accepted, 50\% acceptance rate)},
  doi = {10.1109/IPDPSW.2010.5470687},
  file = {material/StreamingCC-MTAAP10.pdf},
  abstract = {We present a new approach for parallel massive graph analysis of streaming, temporal data with a dynamic and extensible representation.  Handling the constant stream of new data from health care, security, business, and social network applications requires new algorithms and data structures.  We examine data structure and algorithm trade-offs that extract the parallelism necessary for high-performance updating analysis of massive graphs.  Static analysis kernels often rely on storing input data in a specific structure.  Maintaining these structures for each possible kernel with high data rates incurs a significant performance cost.  A case study computing clustering coefficients on a general-purpose data structure demonstrates incremental updates can be more efficient than global recomputation.  Within this kernel, we compare three methods for dynamically updating local clustering coefficients: a brute-force local recalculation, a sorting algorithm, and our new approximation method using a Bloom filter.  On 32 processors of a Cray XMT with a synthetic scale-free graph of $2^{24} \approx 16$ million vertices and $2^{29} \approx 537$ million edges, the brute-force method processes a mean of over 50\,000 updates per second and our Bloom filter approaches 200\,000 updates per second.},
  role = {proceedings},
  opttags = {parallel; graph; streaming},
  projtag = {cassmt, intel-sting},
  keywords = {hpda, graph analysis, streaming data, parallel algorithm},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {intel-sting, cassmt}
}
@inproceedings{icpp10,
  author = {David Ediger and Karl Jiang and E. Jason Riedy and David A. Bader and Courtney Corley and Rob Farber and William N. Reynolds},
  ejr-withauthor = {David Ediger and Karl Jiang and David A. Bader and Courtney Corley and Rob Farber and William N. Reynolds},
  title = {Massive Social Network Analysis: Mining Twitter for Social Good},
  booktitle = {39th International Conference on Parallel Processing ({ICPP})},
  role = {proceedings},
  opttags = {parallel; graph},
  year = 2010,
  address = {San Diego, CA},
  month = {September},
  dom = 16,
  acc-note = {(70/225 papers accepted: 31.1\% acceptance rate)},
  doi = {10.1109/ICPP.2010.66},
  file = {material/ICPP10-GraphCT.pdf},
  abstract = {Social networks produce an enormous quantity of data.  Facebook consists of over 400 million active users sharing over 5 \emph{billion} pieces of information each month.  Analyzing this vast quantity of unstructured data presents challenges for software and hardware.  We present GraphCT, a \emph{Graph} \emph{C}haracterization \emph{T}oolkit for massive graphs representing social network data.  On a 128-processor Cray XMT, GraphCT estimates the betweenness centrality of an artificially generated (R-MAT) 537 million vertex, 8.6 billion edge graph in 55 minutes.  We use GraphCT to analyze public data from Twitter, a microblogging network.  Twitter's message connections appear primarily tree-structured as a news dissemination system.  Within the public data, however, are clusters of conversations.  Using GraphCT, we can rank actors within these conversations and help analysts focus attention on a much smaller data subset.},
  projtag = {cassmt, intel-sting},
  keywords = {hpda, graph analysis, streaming data, parallel algorithm},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {intel-sting, cassmt}
}
@incollection{rwp10,
  title = {Here, on the farthest point of the peninsula},
  author = {E. Jason Riedy},
  booktitle = {Read Write Poem NaPoWriMo Anthology},
  editor = {Dana Martin Guthrie},
  publisher = {\href{http://issuu.com}{issuu.com}},
  pages = {86},
  dom = {15},
  month = {September},
  year = {2010},
  url = {http://issuu.com/readwritepoem/docs/read_write_poem_napowrimo_anthology},
  myurl = {http://lovesgoodfood.com/jason/posts/post-0099/},
  role = {poetry},
  opttags = {beach; napowrimo; poetry; rwp},
  keywords = {poetry}
}
@inproceedings{nsfaccelws10,
  title = {Applications in Social Networks},
  author = {Jason Riedy and David Bader and David Ediger},
  ejr-withauthor = {David Bader and David Ediger},
  booktitle = {NSF Workshop on Accelerators for Data-Intensive Applications},
  year = 2010,
  month = {October},
  dom = 13,
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/nsf-workshop-socnet.pdf},
  file = {material/nsf-workshop-socnet.pdf},
  optrole = {presentation},
  opttags = {graph; NSF; streaming},
  projtag = {intel-sting, cassmt},
  keywords = {hpda, parallel algorithm, graph analysis, streaming data},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {intel-sting, cassmt}
}
@unpublished{graph500-1.1,
  title = {Graph 500 Benchmark 1 (``Search'')},
  author = {David A. Bader and Jonathan Berry and Simon Kahan and Richard Murphy and E. Jason Riedy and Jeremiah Willcock},
  ejr-withauthor = {David A. Bader and Jonathan Berry and Simon Kahan and Richard Murphy and Jeremiah Willcock},
  note = {Version 1.1},
  year = 2010,
  month = {October},
  url = {http://www.graph500.org/Specifications.html},
  projtag = {cassmt, percs},
  keywords = {graph analysis, parallel algorithm, mistake},
  ejr-proj = {graph-analysis},
  ejr-grant = {cassmt, percs}
}
@unpublished{nsf-accel-workshop,
  key = {Report on NSF Workshop on Center Scale Activities Related to Accelerators for Data Intensive Applications},
  title = {{Report on NSF Workshop on Center Scale Activities Related to Accelerators for Data Intensive Applications}},
  author = {Participants},
  editor = {Viktor K. Prasanna and David A. Bader},
  year = 2010,
  month = {October},
  dom = 31,
  note = {This workshop is supported by NSF Grant Number 1051537, in response to the Call for Exploratory Workshop Proposals for Scientific Software Innovation Institutes (S2I2).},
  keywords = {high performance data analysis, accelerator, parallel algorithm},
  ejr-proj = {high-performance-data-analysis},
  ejr-grant = {xscala}
}
@inproceedings{ppopp11-graph,
  author = {David A. Bader and David Ediger and E. Jason Riedy},
  ejr-withauthor = {David A. Bader and David Ediger},
  title = {Parallel Programming for Graph Analysis},
  booktitle = {16th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming (PPoPP)},
  role = {tutorial},
  opttags = {parallel; graph},
  year = 2011,
  month = {February},
  dom = 12,
  address = {San Antonio, TX},
  url = {http://www.cc.gatech.edu/~bader/papers/GraphAnalysisTutorial-PPoPP2011.html},
  abstract = {An increasingly fast-paced, digital world has produced an ever-growing volume of petabyte-sized datasets. At the same time, terabytes of new, unstructured data arrive daily. As the desire to ask more detailed questions about these massive streams has grown, parallel software and hardware have only recently begun to enable complex analytics in this non-scientific space.  In this tutorial, we will discuss the open problems facing us with analyzing this ``data deluge''. We will present algorithms and data structures capable of analyzing spatio-temporal data at massive scale on parallel systems. We will try to understand the difficulties and bottlenecks in parallel graph algorithm design on current systems and will show how multithreaded and hybrid systems can overcome these challenges. We will demonstrate how parallel graph algorithms can be implemented on a variety of architectures using different programming models.  The goal of this tutorial is to provide a comprehensive introduction to the field of parallel graph analysis to an audience with computing background, interested in participating in research and/or commercial applications of this field. Moreover, we will cover leading-edge technical and algorithmic developments in the field and discuss open problems and potential solutions.},
  projtag = {cassmt},
  keywords = {graph analysis, high performance data analysis, streaming data},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {cassmt}
}
@techreport{seed-set-tr,
  title = {Detecting Communities from Given Seeds in Social Networks},
  author = {Riedy, Jason and Bader, David A. and Jiang, Karl and Pande, Pushkar and Sharma, Richa},
  ejr-withauthor = {Bader, David A. and Jiang, Karl and Pande, Pushkar and Sharma, Richa},
  number = {GT-CSE-11-01},
  institution = {Georgia Institute of Technology},
  dom = 22,
  month = {February},
  year = 2011,
  url = {http://hdl.handle.net/1853/36980},
  file = {material/GT-CSE-11-01.pdf},
  abstract = {Analyzing massive social networks challenges both high-performance computers and human understanding. These massive networks cannot be visualized easily, and their scale makes applying complex analysis methods computationally expensive. We present a region-growing method for finding a smaller, more tractable subgraph, a community, given a few example seed vertices. Unlike existing work, we focus on a small number of seed vertices, from two to a few dozen. We also present the first comparison between five algorithms for expanding a small seed set into a community. Our comparison applies these algorithms to an R-MAT generated graph component with 240 thousand vertices and 32 million edges and evaluates the community size, modularity, Kullback-Leibler divergence, conductance, and clustering coefficient. We find that our new algorithm with a local modularity maximizing heuristic based on Clauset, Newman, and Moore performs very well when the output is limited to 100 or 1000 vertices. When run without a vertex size limit, a heuristic from McCloskey and Bader generates communities containing around 60\% of the graph's vertices and having a small conductance and modularity appropriate to the result size. A personalized PageRank algorithm based on Andersen, Lang, and Chung also performs well with respect to our metrics.},
  role = {techreport},
  opttags = {graph; social network},
  projtag = {cassmt}
}
@incollection{smallstone10,
  title = {The storm's coming when the chickens spread out},
  author = {Jason Riedy},
  booktitle = {pay attention: a river of stones},
  editor = {Fiona Robyn and Kaspalita},
  publisher = {\href{http://lulu.com}{lulu.com}},
  pages = 77,
  dom = {2},
  month = {March},
  year = {2011},
  url = {https://www.lulu.com/en/us/shop/kaspalita-and-fiona-robyn/pay-attention-a-river-of-stones/ebook/product-1vkzw9kr.html},
  myurl = {http://lovesgoodfood.com/jason/posts/river-of-stones-7/},
  role = {poetry},
  opttags = {poetry; aros; riverofstones},
  keywords = {poetry}
}
@inproceedings{mtaap11,
  title = {Tracking Structure of Streaming Social Networks},
  author = {David Ediger and E. Jason Riedy and David A. Bader and Henning Meyerhenke},
  ejr-withauthor = {David Ediger and David A. Bader and Henning Meyerhenke},
  booktitle = {5th Workshop on Multithreaded Architectures and Applications (MTAAP)},
  month = {May},
  year = 2011,
  acc-note = {(10/17 papers accepted, 59\% acceptance rate)},
  doi = {10.1109/IPDPS.2011.326},
  file = {material/TrackingComponents-MTAAP11.pdf},
  abstract = {Current online social networks are massive and still growing.  For example, Facebook has over 500 million active users sharing over 30 billion items per month.  The scale within these data streams has outstripped traditional graph analysis methods.  Monitoring requires dynamic analysis rather than repeated static analysis.  The massive state behind multiple persistent queries requires shared data structures and not problem-specific representations.  We present a framework based on the STINGER data structure that can monitor a global property, connected components, on a graph of 16 million vertices at rates of up to 240\,000 updates per second on a 32 processor Cray XMT.  For very large scale-free graphs, our implementation uses novel batching techniques that exploit the scale-free nature of the data and run over three times faster than prior methods.  Our framework handles, for the first time, real-world data rates, opening the door to higher-level analytics such as community and anomaly detection.},
  role = {proceedings},
  opttags = {parallel; graph; streaming},
  projtag = {cassmt, intel-sting},
  keywords = {hpda, graph analysis, streaming data, parallel algorithm},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {intel-sting, cassmt}
}
@misc{GraphEx11,
  title = {Tracking Structure of Streaming Social Networks},
  author = {Jason Riedy and David Ediger and David A. Bader and Henning Meyerhenke},
  ejr-withauthor = {David Ediger and David A. Bader and Henning Meyerhenke},
  howpublished = {2011 Graph Exploitation Symposium hosted by MIT Lincoln Labs},
  year = 2011,
  month = {August},
  dom = 9,
  note = {Invited presentation.},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/GraphEx-2011.pdf},
  file = {material/GraphEx-2011.pdf},
  opttags = {graph; streaming},
  projtag = {intel-sting, cassmt},
  keywords = {hpda, graph analysis, streaming data},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {cassmt, intel-sting}
}
@misc{intel.graph.2011,
  title = {{STING}: Spatio-Temporal Interaction Networks and Graphs for {Intel} Platforms},
  author = {Jason Riedy and David A. Bader and Henning Meyerhenke and David Ediger and Timothy Mattson},
  ejr-withauthor = {David A. Bader and Henning Meyerhenke and David Ediger and Timothy Mattson},
  howpublished = {Presentation at Intel Corporation, Santa Clara, CA},
  year = 2011,
  month = {August},
  dom = 9,
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/GT-STING-for-Intel-beamer.pdf},
  file = {material/intel-2011-08-09.pdf},
  optrole = {presentation},
  projtag = {intel-sting, cassmt},
  keywords = {hpda, parallel algorithm, graph analysis, streaming data},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {intel-sting, cassmt}
}
@inproceedings{ppam11,
  title = {Parallel Community Detection for Massive Graphs},
  author = {E. Jason Riedy and Henning Meyerhenke and David Ediger and David A. Bader},
  ejr-withauthor = {Henning Meyerhenke and David Ediger and David A. Bader},
  booktitle = {9th International Conference on Parallel Processing and Applied Mathematics (PPAM11)},
  publisher = {Springer},
  month = {September},
  year = 2011,
  acc-note = {(134/243 papers accepted, 55\% acceptance rate)},
  doi = {10.1007/978-3-642-31464-3\_29},
  file = {material/ppam11-community-detection.pdf},
  abstract = {Tackling the current volume of graph-structured data requires parallel tools.  We extend our work on analyzing such massive graph data with the first massively parallel algorithm for community detection that scales to current data sizes, scaling to graphs of over 122 million vertices and nearly 2 billion edges in under 7300 seconds on a massively multithreaded Cray XMT.  Our algorithm achieves moderate parallel scalability without sacrificing sequential operational complexity.  Community detection partitions a graph into subgraphs more densely connected within the subgraph than to the rest of the graph.  We take an agglomerative approach similar to Clauset, Newman, and Moore's sequential algorithm, merging pairs of connected intermediate subgraphs to optimize different graph properties.  Working in parallel opens new approaches to high performance.  On smaller data sets, we find the output's modularity compares well with the standard sequential algorithms.},
  role = {proceedings},
  opttags = {parallel; graph; community detection},
  projtag = {cassmt, intel-sting},
  keywords = {hpda, graph analysis, parallel algorithm},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {intel-sting, cassmt}
}
@inproceedings{md11-graph,
  author = {David A. Bader and David Ediger and E. Jason Riedy},
  ejr-withauthor = {David A. Bader and David Ediger},
  title = {Parallel Programming for Graph Analysis},
  booktitle = {full day tutorial},
  role = {tutorial},
  opttags = {parallel; graph},
  year = 2011,
  month = {September},
  dom = 28,
  address = {Columbia, MD},
  projtag = {cassmt},
  keywords = {graph analysis, high performance data analysis, streaming data},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {cassmt}
}
@inproceedings{DBLP:conf_dimacs_RiedyMEB12,
  file = {material/dimacs10-proceedings-community-detection.pdf},
  title = {Parallel community detection for massive graphs},
  author = {E. Jason Riedy and Henning Meyerhenke and David Ediger and David A. Bader},
  booktitle = {Graph Partitioning and Graph Clustering},
  year = {2012},
  editor = {David A. Bader and Henning Meyerhenke and Peter Sanders and Dorothea Wagner},
  pages = {207--222},
  publisher = {American Mathematical Society},
  series = {Contemporary Mathematics},
  volume = {588},
  abstract = {Tackling the current volume of graph-structured data requires parallel tools. We extend our work on analyzing such massive graph data with a massively parallel algorithm for community detection that scales to current data sizes, clustering a real-world graph of over 100 million vertices and over 3 billion edges in under 500 seconds on a four-processor Intel E7-8870-based server.  Our algorithm achieves moderate parallel scalability without sacrificing sequential operational complexity.  Community detection partitions a graph into subgraphs more densely connected within the subgraph than to the rest of the graph. We take an agglomerative approach similar to Clauset, Newman, and Moore's sequential algorithm, merging pairs of connected intermediate subgraphs to optimize different graph properties. Working in parallel opens new approaches to high performance. We improve performance of our parallel community detection algorithm on both the Cray XMT2 and OpenMP platforms and adapt our algorithm to the DIMACS Implementation Challenge data set.},
  bibsource = {DBLP, http://dblp.uni-trier.de},
  doi = {10.1090/conm/588/11703},
  ee = {http://www.ams.org/books/conm/588/11703},
  optisbn = {978-0-8218-9038-7, 978-0-8218-9869-7},
  isbn = {978-0-8218-9038-7},
  opttitle = {Graph Partitioning and Graph Clustering - 10th DIMACS Implementation Challenge Workshop, Georgia Institute of Technology, Atlanta, GA, USA, February 13-14, 2012. Proceedings},
  opturl = {http://www.ams.org/books/conm/588/11703/conm588-11703.pdf},
  projtag = {cassmt, intel-sting},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {cassmt, intel-sting},
  keywords = {graph analysis, community detection, hpda, parallel algorithm}
}
@incollection{dimacs10-workshop,
  author = {E. Jason Riedy and Henning Meyerhenke and David Ediger and David A. Bader},
  ejr-withauthor = {Henning Meyerhenke and David Ediger and David A. Bader},
  title = {Parallel Community Detection for Massive Graphs},
  booktitle = {10th DIMACS Implementation Challenge Workshop - Graph Partitioning and Graph Clustering},
  opttags = {parallel; graph; community detection},
  publisher = {(workshop paper)},
  year = 2012,
  month = {February},
  dom = 14,
  address = {Atlanta, Georgia},
  note = {Won first place in the Mix Challenge and Mix Pareto Challenge},
  url = {http://www.cc.gatech.edu/dimacs10/papers/[15]-dimacs10-community-detection.pdf},
  file = {material/dimacs10-community-detection.pdf},
  abstract = {Tackling the current volume of graph-structured data requires parallel tools. We extend our work on analyzing such massive graph data with a massively parallel algorithm for community detection that scales to current data sizes, clustering a real-world graph of over 100 million vertices and over 3 billion edges in under 500 seconds on a four-processor Intel E7-8870-based server. Our algorithm achieves moderate parallel scalability without sacrificing sequential operational complexity.  Community detection partitions a graph into subgraphs more densely connected within the subgraph than to the rest of the graph. We take an agglomerative approach similar to Clauset, Newman, and Moore's sequential algorithm, merging pairs of connected intermediate subgraphs to optimize different graph properties. Working in parallel opens new approaches to high performance. We improve performance of our parallel community detection algorithm on both the Cray XMT2 and OpenMP platforms and adapt our algorithm to the DIMACS Implementation Challenge data set.},
  projtag = {cassmt, intel-sting},
  keywords = {hpda, graph analysis, parallel algorithm},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {intel-sting, cassmt}
}
@inproceedings{ppopp12-graph,
  author          = {David Ediger and Jason Riedy and Rob McColl and David A. Bader},
  ejr-withauthor  = {David Ediger and Rob McColl and David A. Bader},
  title           = {Parallel Programming for Graph Analysis},
  booktitle       = {17th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming (PPoPP)},
  address         = {New Orleans, LA},
  year            = 2012,
  month           = {February},
  dom             = 26,
  role            = {tutorial},
  url             = {http://www.cc.gatech.edu/~bader/papers/GraphAnalysisTutorial-PPoPP2012.html},
  opttags         = {parallel; graph},
  keywords        = {graph analysis, high performance data analysis, streaming data},
  officialproject = {nsf-s2i2-conc},
  projtag         = {cassmt},
  ejr-proj        = {hpda, graph-analysis},
  ejr-grant       = {cassmt},
  abstract        = {An increasingly fast-paced, digital world has produced an ever-growing volume of petabyte-sized datasets. At the same time, terabytes of new, unstructured data arrive daily. As the desire to ask more detailed questions about these massive streams has grown, parallel software and hardware have only recently begun to enable complex analytics in this non-scientific space.  In this tutorial, we will discuss the open problems facing us with analyzing this "data deluge". We will present algorithms and data structures capable of analyzing spatio-temporal data at massive scale on parallel systems. We will try to understand the difficulties and bottlenecks in parallel graph algorithm design on current systems and will show how multithreaded and hybrid systems can overcome these challenges. We will demonstrate how parallel graph algorithms can be implemented on a variety of architectures using different programming models.  The goal of this tutorial is to provide a comprehensive introduction to the field of parallel graph analysis to an audience with computing background, interested in participating in research and/or commercial applications of this field. Moreover, we will cover leading-edge technical and algorithmic developments in the field and discuss open problems and potential solutions.}
}
@misc{pp12-streaming-ms,
  author         = {E. Jason Riedy and Henning Meyerhenke},
  ejr-withauthor = {Henning Meyerhenke},
  title          = {Scalable Algorithms for Analysis of Massive, Streaming Graphs},
  howpublished   = {SIAM Parallel Processing for Scientific Computing},
  address        = {Savannah, GA},
  year           = 2012,
  month          = {February},
  dom            = 15,
  note           = {Minisymposium organizer with Henning Meyerhenke.},
  optrole        = {presentation},
  url            = {http://www.slideshare.net/jasonriedy/siam-pp-2012-scalable-algorithms-for-analysis-of-massive-streaming-graphs},
  file           = {material/siam-pp-2012.pdf},
  opttags        = {siam; streaming data; parallel algorithms},
  keywords       = {hpda, parallel algorithm, graph analysis, streaming data},
  projtag        = {intel-sting, cassmt},
  ejr-proj       = {high-performance-data-analysis, graph-analysis},
  ejr-grant      = {intel-sting, cassmt},
  abstract       = {Graph-structured data in social networks, finance, network security, and others not only are massive but also under continual change. These changes often are scattered across the graph. Repeating complex global analyses on massive snapshots to capture only what has changed is inefficient. We discuss analysis algorithms for streaming graph data that maintain both local and global metrics. We extract parallelism from both analysis kernel and graph data to scale performance to real-world sizes.}
}
@misc{pp12-community-ms,
  author         = {Henning Meyerhenke and E. Jason Riedy and David A. Bader},
  ejr-withauthor = {Henning Meyerhenke and David A. Bader},
  title          = {Parallel Community Detection in Streaming Graphs},
  howpublished   = {SIAM Parallel Processing for Scientific Computing},
  address        = {Savannah, GA},
  year           = 2012,
  month          = {February},
  dom            = 15,
  role           = {minisymposium organizer},
  optrole        = {presentation},
  opttags        = {siam; streaming data; parallel algorithms},
  keywords       = {hpda, parallel algorithm, graph analysis, streaming data},
  projtag        = {intel-sting, cassmt},
  ejr-proj       = {high-performance-data-analysis, graph-analysis},
  ejr-grant      = {intel-sting, cassmt}
}
@misc{pp12-GraphCT,
  author = {David Ediger and E. Jason Riedy and Henning Meyerhenke and David A. Bader},
  ejr-withauthor = {David Ediger and Henning Meyerhenke and David A. Bader},
  title = {Analyzing Massive Networks with {GraphCT}},
  howpublished = {SIAM Parallel Processing for Scientific Computing},
  dom = 16,
  month = {February},
  year = 2012,
  role = {poster},
  opttags = {siam; parallel algorithms},
  address = {Savannah, GA},
  projtag = {intel-sting, cassmt},
  keywords = {hpda, parallel algorithm, graph analysis, streaming data},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {intel-sting, cassmt}
}
@misc{pp12-STING,
  file = {material/siam-pp12-stinger-poster.pdf},
  author = {E. Jason Riedy and David Ediger and Henning Meyerhenke and David A. Bader},
  ejr-withauthor = {David Ediger and Henning Meyerhenke and David A. Bader},
  title = {{STING}: Software for Analysis of Spatio-Temporal Interaction Networks and Graphs},
  howpublished = {SIAM Parallel Processing for Scientific Computing},
  dom = 16,
  month = {February},
  year = 2012,
  role = {poster},
  opttags = {siam; parallel algorithms},
  address = {Savannah, GA},
  abstract = {Current tools for analyzing graph-structured data and semantic networks focus on static graphs. Our STING package tackles analysis of streaming graphs like today's social networks and communication tools. STING maintains a massive graph under changes while coordinating analysis kernels to achieve analysis at real-world data rates. We show examples of local metrics like clustering coefficients and global metrics like connected components and agglomerative clustering. STING supports parallel Intel architectures as well as the Cray XMT.},
  projtag = {intel-sting, cassmt},
  keywords = {hpda, parallel algorithm, graph analysis, streaming data},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {intel-sting, cassmt}
}
@incollection{icassp2012-stinger,
  author         = {Jason Riedy and Henning Meyerhenke and David A. Bader and David Ediger and Timothy G. Mattson},
  ejr-withauthor = {Henning Meyerhenke and David Ediger and David A. Bader and Timothy G. Mattson},
  title          = {Analysis of Streaming Social Networks and Graphs on Multicore Architectures},
  booktitle      = {{IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  address        = {Kyoto, Japan},
  year           = 2012,
  month          = {March},
  dom            = 29,
  doi            = {10.1109/ICASSP.2012.6289126},
  url            = {http://www.slideshare.net/jasonriedy/icassp-2012-analysis-of-streaming-social-networks-and-graphs-on-multicore-architectures},
  slide-url      = {http://ur1.ca/i6dz6},
  file           = {material/icassp2012.pdf},
  keywords       = {hpda, graph analysis, streaming data, parallel algorithm},
  projtag        = {cassmt, intel-sting},
  ejr-proj       = {high-performance-data-analysis, graph-analysis},
  ejr-grant      = {intel-sting, cassmt},
  abstract       = {Analyzing static snapshots of massive, graph-structured data cannot keep pace with the growth of social networks, financial transactions, and other valuable data sources.  We introduce a framework, STING (Spatio-Temporal Interaction Networks and Graphs), and evaluate its performance on multicore, multisocket Intel(R)-based platforms.  STING achieves rates of around 100\,000 edge updates per second on large, dynamic graphs with a single, general data structure.  We achieve speed-ups of up to 1000$\times$ over parallel static computation, improve monitoring a dynamic graph's connected components, and show an exact algorithm for maintaining local clustering coefficients performs better on Intel-based platforms than our earlier approximate algorithm.}
}
@inproceedings{mtaap12,
  author         = {E. Jason Riedy and David A. Bader and Henning Meyerhenke},
  ejr-withauthor = {David A. Bader and Henning Meyerhenke},
  title          = {Scalable Multi-threaded Community Detection in Social Networks},
  booktitle      = {6th Workshop on Multithreaded Architectures and Applications (MTAAP)},
  year           = 2012,
  month          = {May},
  dom            = 25,
  role           = {proceedings},
  acc-note       = {(9/15 papers accepted, 60\% acceptance)},
  doi            = {10.1109/IPDPSW.2012.203},
  slide-url      = {http://www.slideshare.net/jasonriedy/mtaap12-scalable-community-detection},
  file           = {material/mtaap12.pdf},
  opttags        = {parallel; graph; community detection},
  keywords       = {hpda, graph analysis, parallel algorithm},
  projtag        = {cassmt, intel-sting},
  ejr-proj       = {high-performance-data-analysis, graph-analysis},
  ejr-grant      = {intel-sting, cassmt},
  abstract       = {The volume of existing graph-structured data requires improved parallel tools and algorithms.  Finding communities, smaller subgraphs densely connected within the subgraph than to the rest of the graph, plays a role both in developing new parallel algorithms as well as opening smaller portions of the data to current analysis tools.  We improve performance of our parallel community detection algorithm by 20\% on the massively multithreaded Cray XMT, evaluate its performance on the next-generation Cray XMT2, and extend its reach to Intel-based platforms with OpenMP.  To our knowledge, not only is this the first massively parallel community detection algorithm but also the only such algorithm that achieves excellent performance and good parallel scalability across all these platforms.  Our implementation analyzes a moderate sized graph with 105 million vertices and 3.3 billion edges in around 500 seconds on a four processor, 80-logical-core Intel-based system and 1100 seconds on a 64-processor Cray XMT2.}
}
@misc{intel.graph.2012,
  author         = {Jason Riedy and David A. Bader and David Ediger and Rob McColl and Timothy G. Mattson},
  ejr-withauthor = {David A. Bader and David Ediger and Rob McColl and Timothy G. Mattson},
  title          = {{STING}: Spatio-Temporal Interaction Networks and Graphs for {Intel} Platforms},
  howpublished   = {Presentation at Intel Corporation, Santa Clara, CA},
  year           = 2012,
  month          = {July},
  dom            = 24,
  optrole        = {presentation},
  url            = {http://www.slideshare.net/jasonriedy/gt-stingintelslides},
  file           = {material/intel-2012-07-12.pdf},
  keywords       = {hpda, parallel algorithm, graph analysis, streaming data},
  projtag        = {intel-sting},
  ejr-proj       = {high-performance-data-analysis, graph-analysis},
  ejr-grant      = {intel-sting}
}
@misc{an12-streaming-ms,
  author         = {David A. Bader and David Ediger and Jason Riedy},
  ejr-withauthor = {David A. Bader and David Ediger},
  title          = {Streaming Graph Analytics for Massive Graphs},
  howpublished   = {SIAM Annual Meeting},
  address        = {Minneapolis, MN},
  year           = 2012,
  month          = {July},
  dom            = 10,
  optrole        = {presentation},
  url            = {http://www.slideshare.net/jasonriedy/streaming-graph-analytics-for-massive-graphs},
  file           = {material/siam-an-2012.pdf},
  opttags        = {siam; streaming data; parallel algorithms},
  keywords       = {hpda, parallel algorithm, graph analysis, streaming data},
  projtag        = {intel-sting, cassmt},
  ejr-proj       = {high-performance-data-analysis, graph-analysis},
  ejr-grant      = {intel-sting, cassmt},
  abstract       = {Emerging real-world graph problems include detecting community structure in large social networks, improving the resilience of the electric power grid, and detecting and preventing disease in human populations. The volume and richness of data combined with its rate of change renders monitoring properties at scale by static recomputation infeasible. We approach these problems with massive, fine-grained parallelism across different shared memory architectures both to compute solutions and to explore the sensitivity of these solutions to natural bias and omissions within the data.}
}
@inproceedings{stinger-hpec12,
  author         = {David Ediger and Robert McColl and Jason Riedy and David A. Bader},
  ejr-withauthor = {David Ediger and Robert McColl and David A. Bader},
  title          = {{STINGER}: High Performance Data Structure for Streaming Graphs},
  booktitle      = {The IEEE High Performance Extreme Computing Conference (HPEC)},
  address        = {Waltham, MA},
  year           = 2012,
  month          = {September},
  dom            = 12,
  note           = {Best paper award},
  role           = {proceedings},
  doi            = {10.1109/HPEC.2012.6408680},
  file           = {material/hpec12-stinger.pdf},
  opttags        = {parallel; graph; streaming},
  keywords       = {hpda, graph analysis, streaming data, parallel algorithm},
  projtag        = {cassmt, intel-sting},
  ejr-proj       = {high-performance-data-analysis, graph-analysis},
  ejr-grant      = {intel-sting, cassmt},
  abstract       = {The current research focus on ``big data'' problems highlights the scale and complexity of analytics required and the high rate at which data may be changing. In this paper, we present our high performance, scalable and portable software, Spatio-Temporal Interaction Networks and Graphs Extensible Representation (STINGER), that includes a graph data structure that enables these applications. Key attributes of STINGER are fast insertions, deletions, and updates on semantic graphs with skewed degree distributions. We demonstrate a process of algorithmic and architectural optimizations that enable high performance on the Cray XMT family and Intel multicore servers. Our implementation of STINGER on the Cray XMT processes over 3 million updates per second on a scale-free graph with 537 million edges.}
}
@misc{hpcs-panel-2012,
  author       = {Lauren L. Smith and Dolores A. Shaffer},
  title        = {{DARPA}'s {H}igh {P}roductivity {C}omputing {S}ystems Program: A Final Report},
  howpublished = {Supercomputing Birds-of-a-Feather session},
  year         = 2012,
  month        = {November},
  dom          = 14,
  note         = {Invited panel speaker},
  keywords     = {hpda, graph analysis, streaming data, novel architectures},
  projtag      = {percs, cassmt},
  ejr-proj     = {high-performance-data-analysis, graph-analysis, novel-arch},
  ejr-grant    = {percs},
  abstract     = {The DARPA High Productivity Computing Systems (HPCS) program has been focused on providing a new generation of economically viable high productivity computing systems for national security, scientific, industrial and commercial applications. This program was unique because it focused on system productivity that was defined to include enhancing performance, programmability, portability, usability, manageability and robustness of systems as opposed to just being focused on one execution time performance metric. The BOF is for anyone interested in learning about the two HPCS systems and how productivity in High Performance Computing has been enhanced.}
}
@article{DBLP:journals/corr/SwensonSPPRBV13,
  author        = {Shel Swenson and Yogesh Simmhan and Viktor K. Prasanna and Manish Parashar and E. Jason Riedy and David A. Bader and Richard W. Vuduc},
  title         = {Sustainable Software Development for Next-Gen Sequencing {(NGS)} Bioinformatics on Emerging Platforms},
  journal       = {CoRR},
  volume        = {abs/1309.1828},
  year          = {2013},
  url           = {http://arxiv.org/abs/1309.1828},
  archiveprefix = {arXiv},
  opteprint     = {1309.1828},
  biburl        = {https://dblp.org/rec/bib/journals/corr/SwensonSPPRBV13},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
  timestamp     = {Mon, 13 Aug 2018 16:49:01 +0200}
}
@misc{siamcse13-largescalegraph,
  author         = {David A. Bader and Henning Meyerhenke and Jason Riedy},
  ejr-withauthor = {David A. Bader and Henning Meyerhenke},
  title          = {Applications and Challenges in Large-scale Graph Analysis},
  howpublished   = {SIAM Conference on Computational Science and Engineering},
  address        = {Boston, MA},
  year           = 2013,
  month          = {February},
  optrole        = {presentation},
  url            = {http://www.graphanalysis.org/SIAM-CSE13/01_Bader.pdf},
  opttags        = {siam; parallel algorithms},
  keywords       = {hpda, graph analysis, streaming data},
  projtag        = {intel-sting, cassmt, grateful},
  ejr-proj       = {high-performance-data-analysis, graph-analysis},
  ejr-grant      = {intel-sting, cassmt, grateful},
  abstract       = {Emerging real-world graph problems include detecting community structure in large social networks, improving the resilience of the electric power grid, and detecting and preventing disease in human populations. We discuss the opportunities and challenges in massive data-intensive computing for applications in social network analysis, genomics, and security. The explosion of real-world graph data poses substantial challenges for software, hardware, algorithms, and application experts.}
}
@misc{siamcse13-streaminggraph,
  author         = {Robert C. McColl and David Ediger and David A. Bader and Jason Riedy},
  ejr-withauthor = {Robert C. McColl and David Ediger and David A. Bader},
  title          = {Analyzing Graph Structure in Streaming Data with {STINGER}},
  howpublished   = {SIAM Conference on Computational Science and Engineering},
  address        = {Boston, MA},
  year           = 2013,
  month          = {February},
  optrole        = {presentation},
  file           = {material/cse2013-streaming.pdf},
  opttags        = {siam; streaming data; parallel algorithms},
  keywords       = {hpda, parallel algorithm, graph analysis, streaming data},
  projtag        = {intel-sting, cassmt, grateful},
  ejr-proj       = {high-performance-data-analysis, graph-analysis},
  ejr-grant      = {intel-sting, cassmt, grateful},
  abstract       = {Analyzing static snapshots of massive, graph-structured data cannot keep pace with the growth of social networks, financial transactions, and other valuable data sources. Our software framework, STING (Spatio-Temporal Interaction Networks and Graphs), uses a scalable, high-performance graph data structure to enable these applications. STING supports fast insertions, deletions, and updates on graphs with semantic information and skewed degree distributions. STING achieves large speed-ups over parallel, static recomputation on both common multicore and specialized multithreaded platforms.}
}
@article{Riedy:2013:MSD:2425676.2425689,
  author     = {Riedy, Jason and Bader, David A.},
  title      = {Massive Streaming Data Analytics: A Graph-based Approach},
  journal    = {XRDS: Crossroads, The ACM Magazine for Students --- Scientific Computing},
  volume     = {19},
  number     = {3},
  pages      = {37--43},
  numpages   = {7},
  issue_date = {Spring 2013},
  year       = {2013},
  month      = {March},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  issn       = {1528-4972},
  doi        = {10.1145/2425676.2425689},
  opturl     = {http://dx.doi.org/10.1145/2425676.2425689},
  acmid      = {2425689},
  file       = {material/xrds-2013.pdf},
  keywords   = {graph analysis, high performance data analysis, streaming data},
  projtag    = {cassmt},
  ejr-proj   = {hpda, graph-analysis},
  ejr-grant  = {cassmt},
  abstract   = {Analyzing massive streaming graphs efficiently requires new algorithms, data structures, and computing platforms.}
}
@inproceedings{mtaap13,
  author         = {E. Jason Riedy and David A. Bader},
  ejr-withauthor = {David A. Bader},
  title          = {Multithreaded Community Monitoring for Massive Streaming Graph Data},
  booktitle      = {7th Workshop on Multithreaded Architectures and Applications (MTAAP)},
  address        = {Boston, MA},
  year           = 2013,
  month          = {May},
  dom            = 24,
  role           = {proceedings},
  acc-note       = {(11/16 papers accepted, 69\% acceptance)},
  doi            = {10.1109/IPDPSW.2013.229},
  file           = {material/mtaap13-streaming-community-monitoring.pdf},
  opttags        = {parallel; graph; streaming; community detection},
  keywords       = {hpda, graph analysis, streaming data, parallel algorithm},
  projtag        = {xscala, grateful, cassmt, intel-sting},
  ejr-proj       = {high-performance-data-analysis, graph-analysis},
  ejr-grant      = {grateful, xscala, intel-sting, cassmt},
  abstract       = {Analyzing static snapshots of massive, graph-structured data cannot keep pace with the growth of social networks, financial transactions, and other valuable data sources.  Current state-of-the-art industrial methods analyze these streaming sources using only simple, aggregate metrics.  There are few existing scalable algorithms for monitoring complex global quantities like decomposition into community structure.  Using our framework STING, we present the first known parallel algorithm specifically for monitoring communities in this massive, streaming, graph-structured data.  Our algorithm performs incremental re-agglomeration rather than starting from scratch after each batch of changes, reducing the problem's size to that of the change rather than the entire graph.  We analyze our initial implementation's performance on multithreaded platforms for execution time and latency.  On an Intel-based multithreaded platform, our algorithm handles up to 100 million updates per second on social networks with one to 30 million edges, providing a speed-up from 4$\times$ to 3700$\times$ over statically recomputing the decomposition after each batch of changes.  Possibly because of our artificial graph generator, resulting communities' modularity varies little from the initial graph.}
}
@unpublished{s2i2-ipdps-2013,
  author          = {Shel Swenson and Yogesh Simmhan and Viktor Prasanna and Manish Parashar and David Bader and Jason Riedy and Richard Vuduc},
  title           = {Report on ``Workshop on Accelerating Bioinformatics Applications Enabled by NextGen-Sequencing''},
  note            = {Co-located with IPDPS 2013},
  address         = {Boston, MA},
  year            = 2013,
  month           = {May},
  dom             = 19,
  url             = {http://future-compute.usc.edu/index.php/NGS_Workshop},
  keywords        = {high performance data analysis, accelerator, parallel algorithm},
  officialproject = {nsf-s2i2-conc},
  projtag         = {xscala},
  ejr-proj        = {high-performance-data-analysis},
  ejr-grant       = {xscala}
}
@incollection{GraphCT-Wiley-Chap,
  file = {material/STINGER-with-apps.pdf},
  title = {Computational Graph Analytics for Massive Streaming Data},
  author = {David Ediger and Jason Riedy and David A. Bader and Henning Meyerhenke},
  booktitle = {Large Scale Network-Centric Computing Systems},
  publisher = {Wiley},
  year = {2013},
  chapter = {25},
  editor = {Hamid Sarbazi-azad and Albert Zomaya},
  month = {July},
  series = {Parallel and Distributed Computing},
  abstract = {Handling the constant stream of data from health care, security, business, and social network applications requires new algorithms and data structures.  We present a new approach for parallel massive analysis of streaming, temporal, graph-structured data.  For this purpose we examine data structure and algorithm trade-offs that extract the parallelism necessary for high-performance updating analysis of massive graphs. As a result of this study, we propose the extensible and flexible data structure for massive graphs called STINGER ({S}patio-{T}emporal {I}nteraction {N}etworks and {G}raphs {E}xtensible {R}epresentation).  Two case studies demonstrate our new approach's effectiveness.  The first one computes a dynamic graph's vertices' \emph{clustering coefficients}.  We show that incremental updates are far more efficient than global recomputation.  Within this kernel, we compare three methods for dynamically updating local clustering coefficients: a brute-force local recalculation, a sorting algorithm, and our new approximation method using a Bloom filter.  On 32 processors of a Cray XMT with a synthetic scale-free graph of $2^{24} \approx 16$ million vertices and $2^{29} \approx 537$ million edges, the brute-force method processes a mean of over 50\,000 updates per second, while our Bloom filter approaches 200\,000 updates per second.  The second case study monitors a global feature, a dynamic graph's connected components.  We use similar algorithmic ideas as before to exploit the parallelism in the problem and provided by the hardware architecture.  On a 16 million vertex graph, we obtain rates of up to 240\,000 updates per second on 32 processors of a Cray XMT.  For the large scale-free graphs typical in our applications, our implementation uses novel batching techniques that exploit the scale-free nature of the data and run over three times faster than prior methods.  Our new framework is the first to handle real-world data rates, opening the door to higher-level analytics such as community and anomaly detection.},
  dom = {30},
  isbn = {978-0470936887},
  role = {chapter},
  keywords = {parallel algorithm, hpda, graph analysis, streaming data},
  doi = {10.1002/9781118640708.ch25},
  projtag = {cassmt, xscala, intel-sting},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {cassmt, xscala, intel-sting}
}
@misc{graphlab-2013,
  author       = {Jason Riedy},
  title        = {{STINGER}: Analyzing massive, streaming graphs},
  howpublished = {2nd GraphLab Workshop},
  address      = {San Francisco, CA},
  year         = 2013,
  month        = {July},
  dom          = 1,
  role         = {invited poster and demo},
  file         = {material/graphlab13-poster.pdf},
  opttags      = {graph analysis; parallel algorithms},
  keywords     = {hpda, graph analysis, streaming data},
  projtag      = {intel-sting, cassmt, xscala, grateful},
  ejr-proj     = {high-performance-data-analysis, graph-analysis},
  ejr-grant    = {intel-sting, cassmt, grateful}
}
@article{graphct-tpds-2012,
  author = {David Ediger and Karl Jiang and Jason Riedy and David A. Bader},
  ejr-withauthor = {David Ediger and Karl Jiang and David A. Bader},
  title = {{GraphCT}: Multithreaded Algorithms for Massive Graph Analysis},
  journal = {{IEEE} Transactions on Parallel and Distributed Systems},
  volume = 24,
  year = 2013,
  pages = {2220--2229},
  month = {September},
  doi = {10.1109/TPDS.2012.323},
  issn = {1045-9219},
  url = {http://dx.doi.org/10.1109/TPDS.2012.323},
  role = {refereed},
  gt-role = {Assisted in software development, experimental methods, writing, and editing.},
  abstract = {The digital world has given rise to massive quantities of data that include rich semantic and complex networks. A social graph, for example, containing hundreds of millions of actors and tens of billions of relationships is not uncommon. Analyzing these large data sets, even to answer simple analytic queries, often pushes the limits of algorithms and machine architectures. We present GraphCT, a scalable framework for graph analysis using parallel and multithreaded algorithms on shared memory platforms. Utilizing the unique characteristics of the Cray XMT, GraphCT enables fast network analysis at unprecedented scales on a variety of input data sets. On a synthetic power law graph with 2 billion vertices and 17 billion edges, we can find the connected components in 2 minutes. We can estimate the betweenness centrality of a similar graph with 537 million vertices and over 8 billion edges in under 1 hour. GraphCT is built for portability and performance.},
  file = {material/GraphCT-IEEE.pdf},
  projtag = {cassmt},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {cassmt}
}
@unpublished{s2i2-acmbcb-2013,
  author          = {Shel Swenson and Yogesh Simmhan and Viktor Prasanna and Manish Parashar and David Bader and Jason Riedy and Richard Vuduc},
  title           = {Report on ``Workshop on Challenges in accelerating Next-Gen Sequencing ({NGS}) bioinformatics''},
  note            = {in conjunction with ACM-BCB 2013},
  address         = {Washington, DC},
  year            = 2013,
  month           = {September},
  dom             = 25,
  url             = {http://future-compute.usc.edu/index.php/NGS_Bioinformatics_Workshop},
  keywords        = {high performance data analysis, accelerator, parallel algorithm},
  officialproject = {nsf-s2i2-conc},
  projtag         = {xscala},
  ejr-proj        = {high-performance-data-analysis},
  ejr-grant       = {xscala}
}
@inproceedings{wssspe1,
  author = {Shel Swenson and Yogesh Simmhan and Viktor Prasanna and Manish Parashar and Jason Riedy and David Bader and Richard Vuduc},
  ejr-withauthor = {Shel Swenson and Yogesh Simmhan and Viktor Prasanna and Manish Parashar and David Bader and Richard Vuduc},
  title = {Sustainable Software Development for Next-Gen Sequencing ({NGS}) Bioinformatics on Emerging Platforms},
  booktitle = {First Workshop on Sustainable Software for Science: Practice and Experiences (WSSSPE1)},
  year = 2013,
  month = {November},
  dom = 17,
  address = {Denver, CO},
  note = {held in conjunction with SC13, published electronically (\url{http://wssspe.researchcomputing.org.uk/})},
  url = {http://arxiv.org/abs/1309.1828},
  file = {material/wssspe13.pdf},
  abstract = {DNA sequence analysis is fundamental to life science research. The rapid development of next generation sequencing (NGS) technologies, and the richness and diversity of applications it makes feasible, have created an enormous gulf between the potential of this technology and the development of computational methods to realize this potential. Bridging this gap holds possibilities for broad impacts toward multiple grand challenges and offers unprecedented opportunities for software innovation and research. We argue that NGS-enabled applications need a critical mass of sustainable software to benefit from emerging computing platforms' transformative potential. Accumulating the necessary critical mass will require leaders in computational biology, bioinformatics, computer science, and computer engineering work together to identify core opportunity areas, critical software infrastructure, and software sustainability challenges. Furthermore, due to the quickly changing nature of both bioinformatics software and accelerator technology, we conclude that creating sustainable accelerated bioinformatics software means constructing a sustainable bridge between the two fields. In particular, sustained collaboration between domain developers and technology experts is needed to develop the accelerated kernels, libraries, frameworks and middleware that could provide the needed flexible link from NGS bioinformatics applications to emerging platforms.},
  officialproject = {nsf-s2i2-conc},
  projtag = {xscala},
  keywords = {high performance data analysis, accelerator, parallel algorithm},
  ejr-proj = {high-performance-data-analysis},
  ejr-grant = {xscala}
}
@misc{intel.graph.2014,
  author = {Jason Riedy and David A. Bader and David Ediger and Rob McColl and Timothy G. Mattson},
  ejr-withauthor = {David A. Bader and David Ediger and Rob McColl and Timothy G. Mattson},
  title = {{STING}: Spatio-Temporal Interaction Networks and Graphs for {Intel} Platforms},
  howpublished = {Presentation at Intel Corporation, Santa Clara, CA},
  year = 2014,
  month = {January},
  dom = 17,
  optrole = {presentation},
  url = {http://www.slideshare.net/jasonriedy/intel-20140117},
  file = {material/intel-2014-01-17.pdf},
  keywords = {hpda, graph analysis, streaming data},
  projtag = {intel-sting},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {intel-sting, cassmt, grateful}
}
@inproceedings{stinger-gabb2014,
  file = {material/gabb-2014-pres.pdf},
  author = {Jason Riedy and David A. Bader},
  title = {{STINGER}: Multi-threaded Graph Streaming},
  dom = 19,
  booktitle = {{Graph} {Algorithms} {Building} {Blocks} ({GABB} 2014)},
  year = 2014,
  month = {May},
  address = {Phoenix, AZ},
  note = {Invited presentation and panelist. (Workshop with IPDPS 2014)},
  keywords = {graph analysis, streaming data, high performance data analysis, parallel algorithm},
  url = {http://www.slideshare.net/jasonriedy/stinger-multithreaded-graph-streaming},
  projtag = {intel-sting, xscala, hpda, percs},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {intel-sting, xscala, cassmt, percs, grateful}
}
@misc{graphlab-2014,
  author = {Jason Riedy},
  title = {{STINGER}: Analyzing massive, streaming graphs},
  howpublished = {3rd GraphLab Workshop},
  year = 2014,
  month = {July},
  dom = 21,
  address = {San Francisco, CA},
  role = {invited poster and demo},
  opttags = {graph analysis; parallel algorithms},
  file = {material/graphlab14-poster.pdf},
  keywords = {hpda, graph analysis, streaming data},
  projtag = {intel-sting, hpda, xscala, grateful},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {intel-sting, cassmt, grateful}
}
@inproceedings{bc-hpec14,
  author = {Adam McLaughlin and Jason Riedy and David A. Bader},
  ejr-withauthor = {Adam McLaughlin and David A. Bader},
  title = {Optimizing Energy Consumption and Parallel Performance for Betweenness Centrality using {GPU}s},
  booktitle = {The IEEE High Performance Extreme Computing Conference (HPEC)},
  year = 2014,
  month = {September},
  dom = 11,
  address = {Waltham, MA},
  note = {``Rising Stars'' section},
  role = {proceedings},
  opttags = {parallel; graph; energy},
  doi = {10.1109/HPEC.2014.7040980},
  file = {material/Optimizing_BC_HPEC14.pdf},
  abstract = {Applications of high-performance graph analysis range from computational biology to network security and even transportation.  These applications often consider graphs under rapid change and are moving beyond HPC platforms into energy-constrained embedded systems.  This paper optimizes one successful and demanding analysis kernel, betweenness centrality, for NVIDIA GPU accelerators in both environments.  Our algorithm for static analysis is capable of exceeding 2 million traversed edges per second per watt (MTEPS/W).  Optimizing the parallel algorithm and treating the dynamic problem directly achieves a 6.39$\times$ average speed-up and 84\% average reduction in energy consumption.},
  keywords = {hpda, graph analysis, parallel algorithm},
  projtag = {xscala, grateful, hpda},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {grateful, xscala}
}
@inproceedings{cmg.2014,
  author = {Jason Riedy and David A. Bader},
  title = {Graph Analysis Trends and Opportunities},
  booktitle = {CMG Performance and Capacity},
  year = 2014,
  month = {November},
  dom = 4,
  address = {Atlanta, GA},
  note = {Invited presentation},
  url = {http://www.slideshare.net/jasonriedy/cmg-20141104},
  file = {material/cmg-2014-11-04.pdf},
  abstract = {High-performance graph analysis is unlocking knowledge in problems like anomaly detection in computer security, community structure in social networks, and many other data integration areas. While graphs provide a convenient abstraction, real-world problems' sparsity and lack of locality challenge current systems. This talk will cover current trends ranging from massive scales to low-power, low-latency systems and summarize opportunities and directions for graphs and computing systems.},
  keywords = {graph analysis, streaming data, high performance data analysis, parallel algorithm},
  projtag = {hpda, grateful, xscala, intel-sting},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {hpda, grateful, xscala, intel-sting}
}
@misc{mng-sdm15,
  author = {Jason Riedy},
  title = {Network Challenge: Error and Sensitivity Analysis},
  howpublished = {SDM-Networks 2015: The Second SDM Workshop on Mining Networks and Graphs: A Big Data Analytic Challenge},
  year = 2015,
  month = {May},
  dom = 2,
  address = {Vancouver, BC},
  note = {Invited panelist},
  opttags = {graph analysis},
  url = {http://www.slideshare.net/jasonriedy/network-challenge-error-and-sensitivity-analysis},
  file = {material/riedy-network-challenge-sdm15.pdf},
  keywords = {graph analysis, sensitivity},
  projtag = {xscala, grateful, hpda},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {xscala, grateful, hpda}
}
@inproceedings{hpec15,
  author = {Adam McLaughlin and Jason Riedy and David A. Bader},
  ejr-withauthor = {Adam McLaughlin and David A. Bader},
  title = {An Energy-Efficient Abstraction for Simultaneous Breadth-First Searches},
  booktitle = {The IEEE High Performance Extreme Computing Conference (HPEC)},
  year = 2015,
  month = {September},
  dom = 17,
  address = {Waltham, MA},
  role = {proceedings},
  opttags = {parallel; graph; energy},
  file = {material/multi_search_energy.pdf},
  abstract = {Optimized GPU kernels are sufficiently complicated to write that they often are specialized to specific input data, target architectures, or applications.  This paper presents a multi-search abstraction for computing multiple breadth-first searches in parallel and demonstrates a high-performance, general implementation.  Our abstraction removes the burden of orchestrating graph traversal from the user while providing high performance and low energy usage, an often overlooked component of algorithm design.  Energy consumption has become a first-class hardware design constraint for both massive and embedded computing platforms.  Our abstraction can be applied to such problems as the all-pairs shortest-path problem, community detection, reachability querying, and others.  To map graph traversal efficiently to NVIDIA GPUs, our hybrid implementation chooses between processing active vertices with a single thread or an entire warp based on vertex outdegree.  For a set of twelve varied graphs, the implementation of our abstraction saves 42\% time and 62\% energy on average compared to representative implementations of specific applications from existing literature.},
  keywords = {hpda, graph analysis, parallel algorithm},
  projtag = {xscala, grateful, hpda},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {grateful, xscala}
}
@misc{dmml-2015,
  file = {material/dmml-2015-ejr.pdf},
  author = {E. Jason Riedy},
  title = {Graph Analysis Beyond Linear Algebra},
  howpublished = {Development of Modern Methods for Linear Algebra},
  month = {October},
  dom = 24,
  year = 2015,
  abstract = {High-performance graph analysis is unlocking knowledge in computer security, bioinformatics, social networks, and many other data integration areas. Graphs provide a convenient abstraction for many data problems beyond linear algebra. Some problems map directly to linear algebra. Others, like community detection, look eerily similar to sparse linear algebra techniques. And then there are algorithms that strongly resist attempts at making them look like linear algebra. This talk will cover recent results with an emphasis on streaming graph problems where the graph changes and results need updated with minimal latency. We'll also touch on issues of sensitivity and reliability where graph analysis needs to learn from numerical analysis and linear algebra.},
  note = {Invited presentation},
  url = {http://www.slideshare.net/jasonriedy/graph-analysis-beyond-linear-algebra},
  projtag = {xscala, grateful, hpda, lapack},
  keywords = {lapack, blas, linear algebra, graph analysis, streaming data},
  ejr-proj = {linear-algebra, graph-analysis, high-performance-data-analysis},
  ejr-grant = {xscala, grateful, hpda}
}
@article{DBLP:journals/corr/DukhanVR16,
  author = {Marat Dukhan and Richard W. Vuduc and E. Jason Riedy},
  title = {Wanted: Floating-Point Add Round-off Error instruction},
  journal = {CoRR},
  volume = {abs/1603.00491},
  year = {2016},
  archiveprefix = {arXiv},
  eprint = {1603.00491},
  url = {http://arxiv.org/abs/1603.00491},
  biburl = {https://dblp.org/rec/bib/journals/corr/DukhanVR16},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Mon, 13 Aug 2018 16:48:45 +0200}
}
@inproceedings{caa16,
  author = {David Bader and Aleksandra Michalewicz and Oded Green and Jessie Birkett-Rees and Jason Riedy and James Fairbanks and Anita Zakrzewska},
  title = {Semantic database applications at the {Samtavro} {Cemetery}, {Georgia}},
  booktitle = {The 44th Computer Applications and Quantitative Methods in Archaeology Conference ({CAA})},
  year = 2016,
  month = {March},
  dom = 30,
  address = {Oslo, Norway},
  abstract = {In 2013 a paper was offered to the CAA concerning archaeological legacy data and semantic database applications, with some preliminary results for a study conducted into the Samtavro cemetery, situated in the South Caucasus in the modern republic of Georgia. The present paper presents further research outcomes of data mining the Samtavro material. Over four thousand graves were excavated at this site, used most intensively during the Late Bronze and Iron Ages, and later in the Roman and Late Antique periods. The current project focuses on the latter period---and the legacy of Soviet and post-Soviet excavations---in a collaborative effort between computer scientists based at the Georgia Institute of Technology, USA, and archaeologists at the University of Melbourne and Monash University, Australia.  Data for 1075 tombs, 1249 individuals, and 5842 grave accoutrements were collected across 74 data fields, resulting in the identification of 9 tomb types, 37 artefact types and 320 artefact subtypes. Methods tested against the Samtavro material culture included the application of clustering techniques to understand associations of related items based on patterns of co-occurrence, using traditional data mining (hierarchical link clustering) and spectral graph theory---focusing on tomb types in relation to artefact types. The other method calculated the probability of each event occurring and comparing this to what we would expect if these were truly random---focusing on artefact types in relation to biological sex and age brackets.  In some instances, our work confirmed previously established relationships, but it likewise revealed new results concerning particular entities. The project demonstrates that although sites for which comprehensive archival records exist can benefit from these types of approaches, often the greatest limitation in taking a `big data' approach is the relative scarcity of archaeological data.},
  projtag = {xscala},
  keywords = {graph analysis, archaeology},
  ejr-proj = {graph-analysis},
  ejr-grant = {xscala}
}
@misc{pp16-streaming-ms,
  author = {E. Jason Riedy and David A. Bader},
  ejr-withauthor = {David A. Bader},
  title = {Scalable Network Analysis: Tools, Algorithms, Applications},
  howpublished = {SIAM Parallel Processing for Scientific Computing},
  year = 2016,
  month = {April},
  dom = 15,
  address = {Paris, France},
  note = {Minisymposium organizer with Henning Meyerhenke and David A. Bader.},
  optrole = {presentation},
  opttags = {siam; streaming data; parallel algorithms},
  url = {http://www.slideshare.net/jasonriedy/scalable-and-efficient-algorithms-for-analysis-of-massive-streaming-graphs-60975076},
  abstract = {Graph analysis provides tools for analyzing the irregular data sets common in health informatics, computational biology, climate science, sociology, security, finance, and many other fields. These graphs possess different structures than typical finite element meshes. Scaling graph analysis to the scales of data being gathered and created has spawned many directions of exciting new research.  This minisymposium includes talks on massive graph generation for testing and evaluating parallel algorithms, novel streaming techniques, and parallel graph algorithms for new and existing problems. It also covers existing parallel frameworks and interdisciplinary applications, e.g. the analysis of climate networks.},
  keywords = {hpda, parallel algorithm, graph analysis, streaming data},
  projtag = {hpda, grateful, xscala},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {cassmt, grateful}
}
@inproceedings{gabb16-pr,
  author = {Jason Riedy},
  title = {Updating {PageRank} for Streaming Graphs},
  booktitle = {{Graph} {Algorithms} {Building} {Blocks} ({GABB} 2016)},
  year = 2016,
  month = {May},
  dom = 23,
  address = {Chicago, IL},
  note = {(Workshop with IPDPS 2016)},
  opttags = {parallel; graph; streaming; pagerank},
  file = {material/streaming-pagerank-gabb2016.pdf},
  abstract = {Incremental graph algorithms can respond quickly to small changes in massive graphs by updating rather than recomputing analysis metrics.  Here we use the linear system formulation of PageRank and ideas from iterative refinement to compute the update to a PageRank vector accurately and quickly.  The core idea is to express the residual of the original solution with respect to the updated matrix representing the graph.  The update to the residual is sparse.  Solving for the solution update with a straight-forward iterative method spreads the change outward from the change locations but converges before traversing the entire graph.  We achieve speed-ups of 2$\times$ to over 40$\times$ relative to a restarted, highly parallel PageRank iteration for small, low-latency batches of edge insertions.  These cases traverse 2$\times$ to nearly 10\,000$\times$ fewer edges than the restarted PageRank iteration.  This provides an interesting test case for the ongoing GraphBLAS effort: Can the APIs support our incremental algorithms cleanly and efficiently?},
  keywords = {hpda, graph analysis, streaming data, parallel algorithm},
  projtag = {grateful, hpda, xscala},
  ejr-proj = {high-performance-data-analysis, graph-analysis, novel-arch},
  ejr-grant = {hpda, grateful, xscala}
}
@inproceedings{pmma16-fpaddre,
  author = {{Dukhan}, Marat and {Vuduc}, Richard and {Riedy}, Jason},
  title = {Wanted: Floating-Point Add Round-off Error Instruction},
  booktitle = {The 2nd International Workshop on Performance Modeling: Methods and Applications ({PMMA16})},
  year = 2016,
  month = {June},
  dom = 23,
  address = {Frankfurt, Germany},
  note = {(Workshop with ISC High Performance)},
  abstract = {We propose a new instruction (FPADDRE) that computes the round-off error in floating-point addition. We explain how this instruction benefits high-precision arithmetic operations in applications where double precision is not sufficient. Performance estimates on Intel Haswell, Intel Skylake, and AMD Steamroller processors, as well as Intel Knights Corner co-processor, demonstrate that such an instruction would improve the latency of double-double addition by up to 55\% and increase double-double addition throughput by up to 103\%, with smaller, but non-negligible benefits for double-double multiplication. The new instruction delivers up to 2x speedups on three benchmarks that use high-precision floating-point arithmetic: double-double matrix-matrix multiplication, compensated dot product, and polynomial evaluation via the compensated Horner scheme.},
  url = {https://blogs.fau.de/hager/files/2016/06/pmma2016-slides_Dukhan.pdf},
  archiveprefix = {arXiv},
  eprint = {1603.00491},
  optprimaryclass = {cs.NA},
  projtag = {ieee754, grateful, lapack, xscala},
  keywords = {floating point, ieee754},
  ejr-proj = {floating-point},
  ejr-grant = {xscala, grateful}
}
@article{Holder:2016:CFC:2980765.2980770,
  author = {Holder, Lawrence B. and Caceres, Rajmonda and Gleich, David F. and Riedy, Jason and Khan, Maleq and Chawla, Nitesh V. and Kumar, Ravi and Wu, Yinghui and Klymko, Christine and Eliassi-Rad, Tina and Prakash, Aditya},
  title = {Current and Future Challenges in Mining Large Networks: Report on the Second {SDM} Workshop on Mining Networks and Graphs},
  journal = {SIGKDD Explorations Newsletter},
  issue_date = {June 2016},
  volume = 18,
  number = 1,
  month = {August},
  year = 2016,
  issn = {1931-0145},
  pages = {39--45},
  numpages = 7,
  doi = {10.1145/2980765.2980770},
  acmid = 2980770,
  publisher = {ACM},
  address = {New York, NY, USA},
  keywords = {Network mining, big data, challenges, graph mining},
  projtag = {grateful, hpda}
}
@misc{blas-ng-feb-2017,
  author = {James Demmel and Greg Henry and Xiaoye Li and Jason Riedy and Peter Tang},
  title = {A Proposal for a Next-Generation {BLAS}},
  howpublished = {Workshop on Batched, Reproducible, and Reduced Precision BLAS},
  year = 2017,
  month = {February},
  dom = 24,
  address = {Atlanta, Georgia},
  url = {http://www.netlib.org/utk/people/JackDongarra/WEB-PAGES/Batched-BLAS-2017/talk05-demmel.pdf},
  keywords = {linear algebra, blas},
  projtag = {lapack, ieee754, xscala},
  ejr-proj = {floating-point, linear-algebra},
  ejr-grant = {xscala, grateful}
}
@misc{cse17-streaming-ms,
  author = {E. Jason Riedy},
  title = {High-Performance Analysis of Streaming Graphs},
  howpublished = {SIAM Conference on Computational Science and Engineering},
  year = 2017,
  month = {March},
  dom = 2,
  address = {Atlanta, GA},
  note = {Minisymposium organizer with Henning Meyerhenke.},
  optrole = {presentation},
  opttags = {siam; streaming data; parallel algorithms},
  url = {https://www.slideshare.net/jasonriedy/highperformance-analysis-of-streaming-graphs},
  abstract = {Graph-structured data in social networks, finance, network security, and others not only are massive but also under continual change. These changes often are scattered across the graph. Stopping the world to run a single, static query is infeasible. Repeating complex global analyses on massive snapshots to capture only what has changed is inefficient. We discuss requirements for single-shot queries on changing graphs as well as recent high-performance algorithms that update rather than recompute results. These algorithms are incorporated into our software framework for streaming graph analysis, STING (Spatio-Temporal Interaction Networks and Graphs).},
  keywords = {hpda, parallel algorithm, graph analysis, streaming data},
  projtag = {hpda, grateful, memory-centric, xscala, crnch-rg},
  ejr-proj = {high-performance-data-analysis, graph-analysis, novel-arch},
  ejr-grant = {hpda, iarpa-emu, grateful}
}
@misc{acs-2017,
  author = {Jason Riedy},
  title = {High-Performance Analysis of Streaming Graphs},
  howpublished = {HPC Analytic Workshop},
  year = 2017,
  month = {June},
  dom = 28,
  address = {Hanover, MD},
  url = {https://www.slideshare.net/jasonriedy/highperformance-analysis-of-streaming-graphs-77348572},
  abstract = {Graph-structured data in social networks, finance, network security, and others not only are massive but also under continual change. These changes often are scattered across the graph. Stopping the world to run a single, static query is infeasible. Repeating complex global analyses on massive snapshots to capture only what has changed is inefficient. We discuss requirements for single-shot queries on changing graphs as well as recent high-performance algorithms that update rather than recompute results. These algorithms are incorporated into our software framework for streaming graph analysis, STINGER.},
  keywords = {hpda, graph analysis, streaming data, memory-centric, novel architectures},
  projtag = {hpda, grateful, memory-centric, crnch-rg},
  ejr-proj = {high-performance-data-analysis, graph-analysis, novel-arch},
  ejr-grant = {hpda, iarpa-emu, grateful}
}
@inproceedings{siamns17-ejr,
  title = {A New Algorithm Model for Massive-Scale Streaming Graph Analysis},
  author = {E. Jason Riedy and Chunxing Yin and David A. Bader},
  booktitle = {SIAM Workshop on Network Science},
  address = {Pittsburgh, PA},
  month = {July},
  dom = 14,
  year = 2017,
  url = {https://www.slideshare.net/jasonriedy/a-new-algorithm-model-for-massivescale-streaming-graph-analysis},
  keywords = {hpda, graph analysis, streaming data},
  projtag = {hpda, memory-centric, grateful, crnch-rg},
  ejr-proj = {high-performance-data-analysis, graph-analysis, novel-arch},
  ejr-grant = {hpda, grateful}
}
@article{localcomm-2017,
  author = {Eisha Nathan and Anita Zakrzewska and Jason Riedy and David A. Bader},
  title = {Local Community Detection in Dynamic Graphs Using Personalized Centrality},
  journal = {Algorithms},
  volume = 10,
  number = 3,
  article_number = 102,
  year = 2017,
  month = {August},
  issn = {1999-4893},
  doi = {10.3390/a10030102},
  abstract = {Analyzing massive graphs poses challenges due to the vast amount of data available.  Extracting smaller relevant subgraphs allows for further visualization and analysis that would otherwise be too computationally intensive.  Furthermore, many real data sets are constantly changing, and require algorithms to update as the graph evolves.  This work addresses the topic of local community detection, or seed set expansion, using personalized centrality measures, specifically PageRank and Katz centrality.  We present a method to efficiently update local communities in dynamic graphs. By updating the personalized ranking vectors, we can incrementally update the corresponding local community.  Applying our methods on real-world graphs, we are able to obtain speedups of up to 60$\times$ compared to static recomputation while maintaining an average recall of 0.94 of the highly ranked vertices returned.  Next, we investigate how approximations of a centrality vector affect the resulting local community. Specifically, our method that guarantees that the vertices returned in the community are the highly ranked vertices from a personalized centrality metric.},
  projtag = {grateful, hpda},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {hpda, grateful, xscala}
}
@misc{ieeecluster2017,
  author = {Eisha Nathan and Anita Zakrzewska and Chunxing Yin and Jason Riedy},
  title = {A New Direction for Streaming Graph Analysis},
  howpublished = {IEEE Cluster},
  year = 2017,
  month = {September},
  dom = 6,
  address = {Honolulu, HI},
  opttags = {graph analysis; parallel algorithms},
  abstract = {Applications in computer network security, social media analysis, and other areas rely on analyzing a changing environment.  The data is rich in relationships and lends itself to graph analysis.  Traditional static graph analysis cannot keep pace with network security applications analyzing nearly one million events per second and social networks like Facebook collecting 500 thousand comments per second.  Streaming frameworks like STINGER support ingesting up three million of edge changes per second but there are few streaming analysis kernels that keep up with these rates.  Here we introduce a new, non-stop model and use it to decouple the analysis from the data ingest.},
  keywords = {hpda, graph analysis, streaming data, memory-centric, novel architectures},
  projtag = {hpda, memory-centric, grateful, crnch-rg},
  ejr-proj = {high-performance-data-analysis, graph-analysis, novel-arch},
  ejr-grant = {hpda, iarpa-emu, grateful}
}
@misc{sc17-blas-bof,
  title = {A Proposal for a Next-Generation {BLAS}},
  author = {E. Jason Riedy and Greg Henry and James Demmel and Mark Gates and Xiaoye S. Li and Ping Tak P. Tang},
  howpublished = {Batched, Reproducible, and Reduced Precision BLAS Birds-of-a-Feather at the International Conference for High Performance Computing, Networking, Storage and Analysis},
  year = 2017,
  month = {November},
  url = {http://icl.utk.edu/bblas/sc17/files/bblas-sc17-riedy.pdf},
  keywords = {linear algebra, blas},
  projtag = {lapack, ieee754, xscala},
  ejr-proj = {floating-point, linear-algebra},
  ejr-grant = {xscala, grateful}
}
@unpublished{ngblas-doc,
  author = {James Demmel and Mark Gates and Greg Henry and Xiaoye S. Li and Jason Riedy and Ping Tak Peter Tang},
  title = {A Proposal for a Next-Generation {BLAS}},
  note = {(living document, being updated)},
  month = {November},
  year = 2017,
  url = {https://goo.gl/hvDu3d},
  projtag = {lapack},
  keywords = {lapack, blas, linear algebra},
  ejr-proj = {linear-algebra, high-performance-data-analysis},
  ejr-grant = {xscala, grateful}
}
@article{DBLP:journals/corr/abs-1811-03743,
  author = {Patrick Lavin and E. Jason Riedy and Rich Vuduc and Jeffrey Young},
  title = {Spatter: {A} Benchmark Suite for Evaluating Sparse Access Patterns},
  journal = {CoRR},
  volume = {abs/1811.03743},
  year = {2018},
  archiveprefix = {arXiv},
  opteprint = {1811.03743},
  url = {http://arxiv.org/abs/1811.03743},
  abstract = {Recent characterizations of data movement performance have evaluated optimizations for dense and blocked accesses used by accelerators like GPUs and Xeon Phi, but sparse access patterns like scatter and gather are still not well understood across current and emerging architectures. We propose a tunable benchmark suite, Spatter, that allows users to characterize scatter, gather, and related sparse access patterns at a low level across multiple backends, including CUDA, OpenCL, and OpenMP. Spatter also allows users to vary the block size and amount of data that is moved to create a more comprehensive picture of sparse access patterns and to model patterns that are found in real applications.  With Spatter we aim to characterize the performance of memory systems in a novel way by evaluating how the density of accesses compares against real-world effective memory bandwidths (measured by STREAM) and how it can be compared across widely varying architectures including GPUs and x86, ARM, and Power CPUs. We demonstrate how Spatter can be used to generate analysis plots comparing different architectures and show that current GPU systems achieve up to 65\% of STREAM bandwidth for sparse accesses and are more energy efficient in doing so for several different sparsity patterns. Our future plans for the spatter benchmark are to use these results to predict the impact of new memory access primitives on various architectures, develop backends for novel hardware like FPGAs and the Emu Chick, and automate testing so that users can perform their own sparse access studies.},
  biburl = {https://dblp.org/rec/bib/journals/corr/abs-1811-03743},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Fri, 23 Nov 2018 12:43:51 +0100}
}
@article{DBLP:journals/corr/abs-1809-07696,
  author = {Jeffrey Young and Eric R. Hein and Srinivas Eswar and Patrick Lavin and Jiajia Li and E. Jason Riedy and Richard W. Vuduc and Tom Conte},
  title = {A Microbenchmark Characterization of the {Emu} {Chick}},
  journal = {CoRR},
  volume = {abs/1809.07696},
  year = {2018},
  archiveprefix = {arXiv},
  opteprint = {1809.07696},
  url = {http://arxiv.org/abs/1809.07696},
  biburl = {https://dblp.org/rec/bib/journals/corr/abs-1809-07696},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Sat, 02 Feb 2019 12:19:08 +0100}
}
@article{DBLP:journals/corr/abs-1808-06334,
  author = {Will Powell and
               E. Jason Riedy and
               Jeffrey S. Young and
               Thomas M. Conte},
  title = {Wrangling Rogues: Managing Experimental Post-Moore Architectures},
  journal = {CoRR},
  volume = {abs/1808.06334},
  year = {2018},
  url = {http://arxiv.org/abs/1808.06334},
  archiveprefix = {arXiv},
  opteprint = {1808.06334},
  timestamp = {Sun, 02 Sep 2018 15:01:54 +0200},
  biburl = {https://dblp.org/rec/bib/journals/corr/abs-1808-06334},
  abstract = {The Rogues Gallery is a new experimental testbed that is focused on tackling ``rogue'' architectures for the Post-Moore era of computing. While some of these devices have roots in the embedded and high-performance computing spaces, managing current and emerging technologies provides a challenge for system administration that are not always foreseen in traditional data center environments.  We present an overview of the motivations and design of the initial Rogues Gallery testbed and cover some of the unique challenges that we have seen and foresee with upcoming hardware prototypes for future post-Moore research. Specifically, we cover the networking, identity management, scheduling of resources, and tools and sensor access aspects of the Rogues Gallery and techniques we have developed to manage these new platforms.},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@misc{pp18-ejr,
  author = {Jason Riedy},
  title = {Graph Analysis: New Algorithm Models, New Architectures},
  howpublished = {SIAM Parallel Processing for Scientific Computing},
  dom = 8,
  month = {March},
  year = 2018,
  note = {Minisymposium organizer with Oded Green and David A. Bader.},
  optrole = {presentation},
  opttags = {siam; streaming data; parallel algorithms},
  address = {Tokyo, Japan},
  projtag = {hpda, memory-centric, crnch-rg},
  keywords = {hpda, graph analysis, streaming data, memory-centric, novel architectures},
  ejr-proj = {high-performance-data-analysis, graph-analysis, novel-arch},
  ejr-grant = {hpda, grateful}
}
@inproceedings{mlg2018_23,
  author = {Chunxing Yin and Jason Riedy and David A. Bader},
  title = {A New Algorithmic Model for Graph Analysis of Streaming Data},
  booktitle = {Proceedings of the 14th International Workshop on Mining and Learning with Graphs ({MLG})},
  year = 2018,
  month = {May},
  url = {http://www.mlgworkshop.org/2018/papers/MLG2018_paper_23.pdf},
  ejr-proj = {high-performance-data-analysis, graph-analysis, novel-arch},
  ejr-grant = {hpda, grateful},
  keywords = {hpda, graph analysis, streaming data}
}
@inproceedings{ashes2018-ejr,
  author = {Eric Hein and Tom Conte and Jeffrey S. Young and Srinivas Eswar and Jiajia Li and Patrick Lavin and Richard Vuduc and Jason Riedy},
  title = {An Initial Characterization of the {Emu} {Chick}},
  booktitle = {The Eighth International Workshop on Accelerators and Hybrid Exascale Systems ({AsHES})},
  role = {proceedings},
  opttags = {parallel; graph; streaming; community detection},
  year = 2018,
  month = {May},
  doi = {10.1109/IPDPSW.2018.00097},
  pages = {579--588},
  isbn = 9781538655559,
  abstract = {The Emu Chick is a prototype system designed around the concept of migratory memory-side processing. Rather than transferring large amounts of data across power-hungry, high-latency interconnects, the Emu Chick moves lightweight thread contexts to near-memory cores before the beginning of each memory read. The current prototype hardware uses FPGAs to implement cache-less "Gossamer" cores for doing computational work and a stationary core to run basic operating system functions and migrate threads between nodes. In this initial characterization of the Emu Chick, we study the memory bandwidth characteristics of the system through benchmarks like STREAM, pointer chasing, and sparse matrix vector multiply. We compare the Emu Chick hardware to architectural simulation and Intel Xeon-based platforms. While it is difficult to accurately compare prototype hardware with existing systems, our initial evaluation demonstrates that the Emu Chick uses available memory bandwidth more efficiently than a more traditional, cache-based architecture. Moreover, the Emu Chick provides stable, predictable performance with 80\% bandwidth utilization on a random-access pointer chasing benchmark with weak locality.},
  optkeywords = {Instruction sets;Bandwidth;Computer architecture;Benchmark testing;Hardware;Prototypes;Kernel;benchmarking;streaming graphs;computer architecture;sparse tensors;emu},
  projtag = {memory-centric, crnch-rg},
  keywords = {hpda, memory-centric, novel architectures},
  ejr-proj = {high-performance-data-analysis, novel-arch},
  ejr-grant = {hpda, iarpa-emu}
}
@inproceedings{cf18-ejr,
  author = {Jason Riedy},
  title = {Streaming Graph Analysis: New Models, New Architectures},
  booktitle = {{ACM} International Conference on Computing Frontiers},
  year = 2018,
  month = {May},
  dom = 9,
  note = {Invited talk.},
  url = {https://www.slideshare.net/jasonriedy/graph-analysis-new-algorithm-models-new-architectures},
  projtag = {hpda, xscala, grateful},
  keywords = {hpda, graph analysis, streaming data, memory-centric, novel architectures},
  ejr-proj = {high-performance-data-analysis, graph-analysis, novel-arch}
}
@inproceedings{arith18-ejr,
  author = {Jason Riedy and James Demmel},
  title = {Augmented Arithmetic Operations Proposed for {IEEE}-754 2018},
  booktitle = {25th {IEEE} Symposium on Computer Arithmetic ({ARITH} 25)},
  month = {June},
  dom = 26,
  year = 2018,
  doi = {10.1109/ARITH.2018.8464813},
  keywords = {floating point, ieee754},
  projtag = {ieee754, lapack, grateful, xscala},
  ejr-grant = {xscala, grateful},
  ejr-proj = {floating-point}
}
@inproceedings{arith18-754,
  author = {Jason Riedy},
  title = {Plans for {IEEE} Standard 754-2028},
  booktitle = {25th {IEEE} Symposium on Computer Arithmetic ({ARITH} 25)},
  note = {Invited talk},
  month = {June},
  dom = 26,
  year = 2018,
  url = {https://www.slideshare.net/jasonriedy/plans-for-ieee-standard-7542028},
  keywords = {ieee754, floating point, memory centric, linear algebra},
  projtag = {xscala, grateful, ieee754, memory-centric},
  ejr-grant = {xscala, grateful},
  ejr-proj = {floating-point, linear-algebra}
}
@article{reproblas-siam-news,
  author = {James Demmel and Jason Riedy and Peter Ahrens},
  title = {Reproducible {BLAS}: Make Addition Associative Again!},
  journal = {{SIAM} News},
  volume = 51,
  number = 8,
  pages = {8},
  month = {October},
  year = {2018},
  url = {https://sinews.siam.org/Details-Page/reproducible-blas-make-addition-associative-again},
  ejr-proj = {linear-algebra, floating-point},
  keywords = {linear algebra, floating point, ieee754}
}
@misc{sc18-blas-bof,
  author = {E. Jason Riedy and Greg Henry and James Demmel and Mark Gates and Xiaoye S. Li and Ping Tak P. Tang},
  title = {Updated Proposal for a Next-Generation {BLAS}},
  howpublished = {Batched, Reproducible, and Reduced Precision BLAS Birds-of-a-Feather at the International Conference for High Performance Computing, Networking, Storage and Analysis},
  year = 2018,
  month = {November},
  url = {http://icl.utk.edu/bblas/sc18/files/NG_BLAS_SC18.pdf},
  abstract = {The classic BLAS interface is concise and mostly predictable.  The BLAS Technical Forum produced a 301-page document in 2001 that incorporated mixed precision and extended operations.  And now we face different implementations for reproducibility, even more precisions, and the batched interfaces.  The explosion of interfaces causes problems for platform optimization and interface generation.  The "Next-Generation BLAS Proposal" provides a unified naming scheme and semantic requirements for extensions.  Inspired by the BLIS project, we also consider a minimal set of microkernels to provide a smaller optimization surface.},
  keywords = {linear algebra, blas},
  projtag = {lapack, ieee754, xscala},
  ejr-proj = {floating-point, linear-algebra}
}
@techreport{ieee754-2019,
  note = {(committee member and contributor)},
  author = {{IEEE 754 Committee}},
  key = {IEEE Std 754-2019},
  journal = {IEEE Std 754-2019},
  type = {IEEE Std},
  number = {754-2019},
  title = {{IEEE} Standard for Floating-Point Arithmetic},
  year = 2019,
  pages = {1--83},
  institution = {Microprocessor Standards Committee of the IEEE Computer Society},
  optabstract = {This standard specifies interchange and arithmetic formats and methods for binary and decimal floating-point arithmetic in computer programming environments. This standard specifies exception conditions and their default handling. An implementation of a floating-point system conforming to this standard may be realized entirely in software, entirely in hardware, or in any combination of software and hardware. For operations specified in the normative part of this standard, numerical results and exceptions are uniquely determined by the values of the input data, sequence of operations, and destination formats, all under user control.},
  keywords = {IEEE standards;floating point arithmetic;programming;IEEE standard;arithmetic formats;computer programming;decimal floating-point arithmetic;754-2008;NaN;arithmetic;binary;computer;decimal;exponent;floating-point;format;interchange;number;rounding;significand;subnormal},
  optdoi = {10.1109/IEEESTD.2008.4610935},
  isbn = {978-1-5044-5897-9},
  address = {New York, NY},
  optmonth = {August},
  optdom = 29,
  projtag = {ieee754},
  url = {https://ieeexplore.ieee.org/servlet/opac?punumber=8739148}
}
@article{DBLP:journals/corr/abs-1901-02775,
  title = {Programming Strategies for Irregular Algorithms on the {Emu} {Chick}},
  author = {Eric R. Hein and
               Srinivas Eswar and
               Abdurrahman Yasar and
               Jiajia Li and
               Jeffrey S. Young and
               Thomas M. Conte and
               {\"{U}}mit V. {\c{C}}ataly{\"{u}}rek and
               Rich Vuduc and
               E. Jason Riedy and
               Bora U{\c{c}}ar},
  year = {2019},
  journal = {CoRR},
  volume = {abs/1901.02775},
  archiveprefix = {arXiv},
  opteprint = {1901.02775},
  url = {http://arxiv.org/abs/1901.02775},
  abstract = {The Emu Chick prototype implements migratory
                  memory-side processing in a novel hardware
                  system. Rather than transferring large amounts of
                  data across the system interconnect, the Emu Chick
                  moves lightweight thread contexts to near-memory
                  cores before the beginning of each remote memory
                  read. Previous work has characterized the
                  performance of the Chick prototype in terms of
                  memory bandwidth and programming differences from
                  more typical, non-migratory platforms, but there has
                  not yet been an analysis of algorithms on this
                  system.  This work evaluates irregular algorithms
                  that could benefit from the lightweight, memory-side
                  processing of the Chick and demonstrates techniques
                  and optimization strategies for achieving
                  performance in sparse matrix-vector multiply
                  operation (SpMV), breadth-first search (BFS), and
                  graph alignment across up to eight distributed nodes
                  encompassing 64 nodelets in the Chick system. We
                  also define and justify relative metrics to compare
                  prototype FPGA-based hardware with established ASIC
                  architectures. The Chick currently supports up to
                  68x scaling for graph alignment, 80 MTEPS for BFS on
                  balanced graphs, and 50\ \% of measured STREAM
                  bandwidth for SpMV.},
  timestamp = {Sat, 02 Feb 2019 12:19:05 +0100},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl = {https://dblp.org/rec/bib/journals/corr/abs-1901-02775}
}
@misc{emu-lps19,
  title = {Characterization of {Emu} with Microbenchmarks},
  author = {E. Jason Riedy},
  howpublished = {Emu Workshop at the Laboratory for Physical Sciences},
  address = {Catonsville, MD},
  year = 2019,
  month = {January},
  dom = 23,
  optrole = {presentation}
}
@misc{cse19-blas,
  title = {A Proposal for Next-Generation {BLAS}},
  author = {Mark Gates and James W. Demmel and Greg Henry and Xiaoye S. Li and E. Jason Riedy and Peter Tang},
  howpublished = {SIAM Conference on Computational Science and Engineering},
  address = {Spokane, WA},
  year = 2019,
  month = {February},
  dom = 26,
  url = {http://icl.utk.edu/bblas/siam-cse19/},
  optrole = {presentation}
}
@misc{cse19-novel-arch,
  author = {Jason Riedy and Jeffrey Young and Tom Conte},
  title = {Novel Architectures for Applications in Data Science and Beyond},
  howpublished = {SIAM Conference on Computational Science and Engineering},
  note = {Minisymposium organizer with Jeffrey Young and Tom Conte.},
  address = {Spokane, WA},
  year = 2019,
  month = {March},
  dom = 1,
  url = {http://www.crnch.gatech.edu/content/siam-cse-2019-go-bananas},
  optrole = {presentation}
}
@inproceedings{asplos19-tutorial,
  title = {Programming Novel Architectures in the Post-{Moore} Era with The {Rogues} {Gallery}},
  author = {E. Jason Riedy and Jeffrey S. Young},
  booktitle = {24th ACM International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS)},
  address = {Providence, RI},
  month = {April},
  year = 2019,
  note = {\url{https://crnch-rg.gitlab.io/asplos-2019/}},
  url = {https://crnch-rg.gitlab.io/asplos-2019/}
}
@inproceedings{pearc19-rogues,
  title = {Wrangling {Rogues}: A Case Study on Managing Experimental Post-{Moore} Architectures},
  author = {Will Powell and Jason Riedy and Jeffrey S. Young and Tom Conte},
  booktitle = {Practice and Experience in Advanced Research Computing ({PEARC} '19)},
  address = {Chicago, IL},
  month = {July},
  year = 2019,
  doi = {10.1145/3332186.3332223}
}
@inproceedings{pearc19-tutorial,
  title = {Programming Novel Architectures in the Post-{Moore} Era with the {Rogues} {Gallery}},
  author = {E. Jason Riedy and Jeffrey S. Young},
  booktitle = {Practice and Experience in Advanced Research Computing ({PEARC})},
  address = {Chicago, IL},
  month = {July},
  year = 2019,
  note = {\url{https://crnch-rg.gitlab.io/pearc-2019/}},
  url = {https://crnch-rg.gitlab.io/pearc-2019/}
}
@misc{iciam19-repro,
  author = {Jason Riedy and James Demmel and Peter Ahrens},
  title = {Reproducible Linear Algebra from Application to Architecture},
  howpublished = {International Congress on Industrial and Applied Mathematics},
  address = {Valencia, Spain},
  year = 2019,
  month = {July},
  dom = 19,
  url = {https://www.slideshare.net/jasonriedy/reproducible-linear-algebra-from-application-to-architecture},
  optrole = {presentation}
}
@misc{iciam19-graph,
  author = {Chunxing Yin and Jason Riedy},
  title = {A New Algorithm Model for Massive-Scale Streaming Graph Analysis},
  howpublished = {International Congress on Industrial and Applied Mathematics},
  address = {Valencia, Spain},
  year = 2019,
  month = {July},
  dom = 16,
  url = {https://www.slideshare.net/jasonriedy/a-new-algorithm-model-for-massivescale-streaming-graph-analysis-156808819},
  optrole = {presentation}
}
@article{parco19-emu,
  author = {Jeffrey Young and Eric Hein and Srinivas Eswar and
                  Patrick Lavin and Jiajia Li and Jason Riedy and
                  Richard Vuduc and Thomas M. Conte},
  title = {A Microbenchmark Characterization of the {Emu}
                  {Chick}},
  month = {September},
  year = 2019,
  doi = {10.1016/j.parco.2019.04.012},
  journal = {Parallel Computing},
  abstract = {The Emu Chick is a prototype system designed around
                  the concept of migratory memory-side
                  processing. Rather than transferring large amounts
                  of data across power-hungry, high-latency
                  interconnects, the Emu Chick moves lightweight
                  thread contexts to near-memory cores before the
                  beginning of each memory read. The current prototype
                  hardware uses FPGAs to implement cache-less
                  ``Gossamer'' cores for doing computational work and
                  a stationary core to run basic operating system
                  functions and migrate threads between nodes. In this
                  multi-node characterization of the Emu Chick, we
                  extend an earlier single-node investigation of the
                  memory bandwidth characteristics of the system
                  through benchmarks like STREAM, pointer chasing, and
                  sparse matrix-vector multiplication. We compare the
                  Emu Chick hardware to architectural simulation and
                  an Intel Xeon-based platform. Our results
                  demonstrate that for many basic operations the Emu
                  Chick can use available memory bandwidth more
                  efficiently than a more traditional, cache-based
                  architecture although bandwidth usage suffers for
                  computationally intensive workloads like
                  SpMV. Moreover, the Emu Chick provides stable,
                  predictable performance with up to 65\% of the peak
                  bandwidth utilization on a random-access pointer
                  chasing benchmark with weak locality.}
}
@inproceedings{hpec19-yin,
  title = {Concurrent {Katz} Centrality for Streaming Graphs},
  author = {Chunxing Yin and Jason Riedy},
  booktitle = {The IEEE High Performance Extreme Computing Conference (HPEC)},
  address = {Waltham, MA},
  month = {September},
  year = 2019,
  keywords = {hpda, graph analysis, parallel algorithm},
  doi = {10.1109/HPEC.2019.8916572}
}
@misc{arm19-panel,
  author = {David Donofrio and Jason Riedy},
  title = {Specializing Architectures for Data Analytics},
  howpublished = {ARM Research Summit BOF on High Performance Graph Analytics: Algorithms, Programming, Architectures},
  address = {Austin, TX},
  year = 2019,
  month = {September},
  dom = 18,
  url = {https://hpc.pnl.gov/armbof/},
  note = {Introduction to invited panel on "We can't build specialized architectures for graphs that can work efficiently with other workloads, so we just need to hand-optimize each and every algorithm for each and every architecture"}
}
@inproceedings{rg-icrc-2019,
  author = {Jeffrey Young and Jason Riedy and Tom Conte and Vivek Sarkar and Prasanth Chatarasi and Sriseshan Srikanth},
  title = {Experimental Insights from the {Rogues} {Gallery} Testbed},
  booktitle = {IEEE International Conference on Rebooting Computing (ICRC19)},
  year = 2019,
  month = {November},
  address = {San Mateo, CA},
  doi = {10.1109/ICRC.2019.8914707}
}
@inproceedings{memsys-sg,
  author = {Lavin, Patrick and Young, Jeffrey and Vuduc,
                  Richard and Riedy, Jason and Vose, Aaron and
                  Ernst, Daniel},
  title = {Evaluating Gather and Scatter Performance on
                  {CPUs} and {GPUs}},
  booktitle = {The International Symposium on Memory
                  Systems (MEMSYS)},
  address = {Washington, DC},
  year = 2020,
  month = {September},
  doi = {10.1145/3422575.3422794},
  url = {http://dx.doi.org/10.1145/3422575.3422794},
  publisher = {ACM}
}
@misc{siam-pp20-repro,
  author = {Jason Riedy and James Demmel and Peter Ahrens},
  title = {Reproducible Linear Algebra from Application to Architecture},
  howpublished = {SIAM Parallel Processing for Scientific Computing},
  address = {Seattle, WA},
  year = 2020,
  month = {February},
  dom = 15,
  url = {https://www.slideshare.net/jasonriedy/reproducible-linear-algebra-from-application-to-architecture-228263588},
  optrole = {presentation}
}
@misc{nsf-icerm-ejr,
  title = {Potential Directions for Moving {IEEE}-754 Forward},
  author = {Jason Riedy},
  howpublished = {NSF ICERM Workshop on Variable Precision in
                  Mathematical and Scientific Computing},
  dom = 7,
  month = {May},
  year = 2020,
  url = {https://icerm.brown.edu/materials/Slides/htw-20-vp/Potential_Directions_for_Moving_IEEE-754_Forward_\%5D_Jason_Riedy,_Georgia_Institute_of_Technology.pdf}
}
@misc{sparse-days-2020,
  author = {Jason Riedy},
  title = {Graph Analysis and Novel Architectures},
  howpublished = {CERFACS Sparse Days},
  year = 2020,
  month = {September},
  dom = 24,
  url = {https://www2.slideshare.net/jasonriedy/graph-analysis-and-novel-architectures},
  optkey = {}
}
@misc{hpec-graphblas-bof,
  title = {{GraphBLAS} and {Emus}},
  author = {Jason Riedy},
  howpublished = {IEEE HPEC GraphBLAS BoF},
  year = 2020,
  month = {September},
  dom = 22,
  url = {https://www2.slideshare.net/jasonriedy/graphblas-and-emus}
}
@article{10.1145/3418077,
  author = {Hein, Eric R. and Eswar, Srinivas and Ya{\c{s}}ar,
                  Abdurrahman and Li, Jiajia and Young, Jeffrey S. and
                  Conte, Thomas M. and {\c{C}}ataly{\"{u}}rek, {\"{U}}mit
                  V. and Vuduc, Richard and Riedy, Jason and U{\c{c}}ar,
                  Bora},
  title = {Programming Strategies for Irregular Algorithms on
                  the {Emu} {Chick}},
  year = {2020},
  issue_date = {October 2020},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  volume = {7},
  number = {4},
  issn = {2329-4949},
  opturl = {https://doi.org/10.1145/3418077},
  doi = {10.1145/3418077},
  abstract = {The Emu Chick prototype implements migratory
                  memory-side processing in a novel hardware
                  system. Rather than transferring large amounts of
                  data across the system interconnect, the Emu Chick
                  moves lightweight thread contexts to near-memory
                  cores before the beginning of each remote memory
                  read. Previous work has characterized the
                  performance of the Chick prototype in terms of
                  memory bandwidth and programming differences from
                  more typical, non-migratory platforms, but there has
                  not yet been an analysis of algorithms on this
                  system. This work evaluates irregular algorithms that
                  could benefit from the lightweight, memory-side
                  processing of the Chick and demonstrates techniques
                  and optimization strategies for achieving
                  performance in sparse matrix-vector multiply
                  operation (SpMV), breadth-first search (BFS), and
                  graph alignment across up to eight distributed nodes
                  encompassing 64 nodelets in the Chick system. We
                  also define and justify relative metrics to compare
                  prototype FPGA-based hardware with established ASIC
                  architectures. The Chick currently supports up to
                  68x scaling for graph alignment, 80 MTEPS for BFS on
                  balanced graphs, and 50\% of measured STREAM
                  bandwidth for SpMV.},
  journal = {ACM Trans. Parallel Comput.},
  month = {October},
  articleno = {25},
  numpages = {25},
  keywords = {EMU architecture}
}
@misc{hpec-graphblas-2021,
  author = {Jason Riedy},
  title = {Lightning talks: Updates/news from the {GraphBLAS} implementers},
  howpublished = {HPEC GraphBLAS BoF},
  year = 2021,
  month = {September},
  dom = 21,
  url = {https://www.slideshare.net/jasonriedy/lucata-at-the-hpec-graphblas-bof-250439305},
  optkey = {}
}
@misc{lagraph-2021-10-13,
  title = {Lightning talks: Updates/news from the {GraphBLAS} implementers},
  author = {Jason Riedy and Shannon Kuntz},
  howpublished = {LAGraph meeting},
  year = 2021,
  month = {October},
  dom = 13,
  url = {https://www.slideshare.net/jasonriedy/lagraph-20211013}
}
@inproceedings{barc-2022,
  title = {Programming on the {Lucata} Data-First Architecture},
  author = {Jason Riedy},
  booktitle = {Boston Area Architecture Workshop ({BARC})},
  note = {Keynote},
  month = {January},
  dom = 22,
  year = 2022,
  url = {https://bostonarch.github.io/2022/BARC23.pdf}
}
@article{smith-2022-concur-graph,
  author = {Smith, Emory and Kuntz, Shannon and Riedy,
                  Jason and Deneroff, Martin},
  title = {Concurrent Graph Queries on the {Lucata}
                  {Pathfinder}},
  journal = {CoRR},
  year = 2022,
  url = {http://arxiv.org/abs/2209.11889v1},
  abstract = {High-performance analysis of unstructured
                  data like graphs now is critical for
                  applications ranging from business
                  intelligence to genome analysis. Towards
                  this, data centers hold large graphs in
                  memory to serve multiple concurrent queries
                  from different users. Even a single analysis
                  often explores multiple options. Current
                  computing architectures often are not the
                  most time- or energy-efficient solutions.
                  The novel Lucata Pathfinder architecture
                  tackles this problem, combining migratory
                  threads for low-latency reading with
                  memory-side processing for high-performance
                  accumulation. One hundred to 750 concurrent
                  breadth-first searches (BFS) all achieve
                  end-to-end speed-ups of 81 \% to 97 \% over
                  one-at-a-time queries on a graph with 522M
                  edges. Comparing to RedisGraph running on a
                  large Intel-based server, the Pathfinder
                  achieves a 19$\times$ speed-up running 128
                  BFS queries concurrently. The Pathfinder
                  also efficiently supports a mix of
                  concurrent analyses, demonstrated with
                  connected components and BFS.},
  archiveprefix = {arXiv},
  eprint = {2209.11889},
  primaryclass = {cs.DC}
}
@article{demmel-2022-propos-consis,
  author = {Demmel, James and Dongarra, Jack and Gates,
                  Mark and Henry, Greg and Langou, Julien and
                  Li, Xiaoye and Luszczek, Piotr and Pereira,
                  Weslley and Riedy, Jason and
                  Rubio-Gonz{\'a}lez, Cindy},
  title = {Proposed Consistent Exception Handling for
                  the {BLAS} and {LAPACK}},
  journal = {CoRR},
  year = 2022,
  url = {http://arxiv.org/abs/2207.09281v1},
  abstract = {Numerical exceptions, which may be caused by
                  overflow, operations like division by 0 or
                  sqrt(-1), or convergence failures, are
                  unavoidable in many cases, in particular
                  when software is used on unforeseen and
                  difficult inputs. As more aspects of society
                  become automated, e.g., self-driving cars,
                  health monitors, and cyber-physical systems
                  more generally, it is becoming increasingly
                  important to design software that is
                  resilient to exceptions, and that responds
                  to them in a consistent way. Consistency is
                  needed to allow users to build higher-level
                  software that is also resilient and
                  consistent (and so on recursively). In this
                  paper we explore the design space of
                  consistent exception handling for the widely
                  used BLAS and LAPACK linear algebra
                  libraries, pointing out a variety of
                  instances of inconsistent exception handling
                  in the current versions, and propose a new
                  design that balances consistency,
                  complexity, ease of use, and performance.
                  Some compromises are needed, because there
                  are preexisting inconsistencies that are
                  outside our control, including in or between
                  existing vendor BLAS implementations,
                  different programming languages, and even
                  compilers for the same programming language.
                  And user requests from our surveys are quite
                  diverse. We also propose our design as a
                  possible model for other numerical software,
                  and welcome comments on our design choices.},
  archiveprefix = {arXiv},
  eprint = {2207.09281},
  primaryclass = {cs.MS}
}
@inproceedings{hpec23-tutorial,
  author = {Jeffrey Young and Patrick Lavin and Jason Riedy and Srinivas Eswar},
  title = {Exploring Graph Analysis for {HPC} with Near-Memory Accelerators},
  booktitle = {IEEE High Performance Extreme Computing ({HPEC})},
  year = 2022,
  month = {September},
  url = {https://github.com/gt-crnch-rg/lucata-pathfinder-tutorial},
  note = {\url{https://github.com/gt-crnch-rg/lucata-pathfinder-tutorial}}
}
@misc{abdelfattah2024interfacesparselinearalgebra,
  author = {Ahmad Abdelfattah and Willow Ahrens and Hartwig Anzt and Chris
                  Armstrong and Ben Brock and Aydin Buluc and Federico Busato
                  and Terry Cojean and Tim Davis and Jim Demmel and Grace Dinh
                  and David Gardener and Jan Fiala and Mark Gates and Azzam
                  Haider and Toshiyuki Imamura and Pedro Valero Lara and Jose
                  Moreira and Sherry Li and Piotr Luszczek and Max Melichenko
                  and Jose Moeira and Yvan Mokwinski and Riley Murray and
                  Spencer Patty and Slaven Peles and Tobias Ribizel and Jason
                  Riedy and Siva Rajamanickam and Piyush Sao and Manu Shantharam
                  and Keita Teranishi and Stan Tomov and Yu-Hsiang Tsai and
                  Heiko Weichelt},
  title = {Interface for Sparse Linear Algebra Operations},
  year = 2024,
  archiveprefix = {arXiv},
  eprint = {2411.13259},
  primaryclass = {cs.MS},
  url = {https://arxiv.org/abs/2411.13259},
  abstract = {The standardization of an interface for dense linear algebra
                  operations in the BLAS standard has enabled interoperability
                  between different linear algebra libraries, thereby boosting
                  the success of scientific computing, in particular in
                  scientific HPC. Despite numerous efforts in the past, the
                  community has not yet agreed on a standardization for sparse
                  linear algebra operations due to numerous reasons. One is the
                  fact that sparse linear algebra objects allow for many
                  different storage formats, and different hardware may favor
                  different storage formats. This makes the definition of a
                  FORTRAN-style all-circumventing interface extremely
                  challenging. Another reason is that opposed to dense linear
                  algebra functionality, in sparse linear algebra, the size of
                  the sparse data structure for the operation result is not
                  always known prior to the information. Furthermore, as opposed
                  to the standardization effort for dense linear algebra, we are
                  late in the technology readiness cycle, and many
                  production-ready software libraries using sparse linear
                  algebra routines have implemented and committed to their own
                  sparse BLAS interface. At the same time, there exists a demand
                  for standardization that would improve interoperability, and
                  sustainability, and allow for easier integration of building
                  blocks. In an inclusive, cross-institutional effort involving
                  numerous academic institutions, US National Labs, and
                  industry, we spent two years designing a hardware-portable
                  interface for basic sparse linear algebra functionality that
                  serves the user needs and is compatible with the different
                  interfaces currently used by different vendors. In this paper,
                  we present a C++ API for sparse linear algebra functionality,
                  discuss the design choices, and detail how software developers
                  preserve a lot of freedom in terms of how to implement
                  functionality behind this API.}
}

This file was generated by bibtex2html 1.99.