R Code to Remove Duplicates from a SQL Server Database

It's easy to remove duplicate rows from an R dataframe using the unique() function:

> df.duplicates <- data.frame(a=c(1,2,3,3), b=c(1,2,3,3))
> df.duplicates
  a b
1 1 1
2 2 2
3 3 3
4 3 3
> unique(df.duplicates)
  a b
1 1 1
2 2 2
3 3 3

But this can be slow for large dataframes. Hideously slow, even.

When I'm reading data from SQL Server into R, I often prefer, therefore, to remove the duplicates within the database before extracting the data. Here's a bit of R code that will make a SQL call to do that:

require(RODBC)

GetSQLServerTableColumnNames <- function(channel, table.name){
   #
   # See: http://stackoverflow.com/questions/600446/sql-server-how-do-you-return-the-column-names-from-a-table
   #

   query <- paste("SELECT
                      syscolumns.name AS [ColumnName]
                   FROM
                      sysobjects, syscolumns
                   WHERE
                          sysobjects.id    = syscolumns.id
                      AND sysobjects.xtype = 'u'
                      AND sysobjects.name  = '"
, table.name, "';", sep="")
   col.names <- sqlQuery(channel, query)
   if (length(col.names==0) stop(paste("Unable to get column names for table", table.name))
   return(col.names)
}

RemoveDuplicateRows <- function(channel, table.name){
   #
   # Remove duplicate rows from a SQL Server table using a SQL call (rather than by removing duplicates from a dataframe)
   #

   col.names <- GetSQLServerTableColumnNames(channel, table.name)

   # If column.names equals c("col1", "col2", "col3"), column.names.string
   # will equal "[col1], [col2], [col3]"
   column.names.string <- paste("[", paste(res$ColumnName, collapse="], ["), "]", sep="")

   query <- paste("SELECT
                      IDENTITY(INT,1,1) AS IDRemoveDuplicateRows,
                      *
                   INTO
                      #temp
                   FROM
                      ["
, table.name, "];

                   DELETE FROM
                      #temp
                   WHERE IDRemoveDuplicateRows NOT IN (
                      SELECT MAX(IDRemoveDuplicateRows)
                      FROM #temp
                      GROUP BY "
, column.names.string, "
                      HAVING MAX(IDRemoveDuplicateRows) IS NOT NULL
                   );

                   DROP TABLE ["
, table.name, "];

                   SELECT
                      "
, column.names.string, "
                   INTO
                      ["
, table.name, "]
                   FROM
                      #temp;

                   DROP TABLE #temp;"
, sep="")
   sqlQuery(channel, query)
}

# Get a connection to a SQL Server database
connect.str <- "driver=SQL Server; server=MyServer; database=MyDB; uid=joedoe; pwd=0000000; Trusted_Connection=TRUE; ReadOnly=FALSE"
channel <- odbcDriverConnect(connect.str)

# Remove duplicates within a SQL Server table
RemoveDuplicateRows(channel, "MyTable")