#!/usr/bin/python

### Calculate and graph correlation between publication fee and
### eigenfactor for open-access journals.
###
### Stuart M. Shieber
### 10/12/2009

### The correlation coefficients are useful for exploring the issue of
### whether open-access journals operate as a vanity publishing
### industry (like trade-book publishing) in which standards are
### inversely correlated with author fees (that is, positively
### correlated with royalties, which are negative author fees) or as a
### quality publishing industry, in which standards are positively
### correlated with author fees.

### This program assumes that the journal data is stored in a
### comma-separated value file journal-data.csv. It prints a small
### report on the number of journals used in the analysis and the
### correlation coefficient (Pearson's) between the publication fees
### and article influences. It also generates a scatter plot with
### linear fit lines of the data written to vanitypress.png.

### For further discussion, see The Occasional Pamphlet, "Is
### open-access publishing a vanity publishing industry?"

import csv    # i/o for csv files
import sys    # interpreter version check when opening the data file

import numpy  # for statistics


### Utilities

def isFloat(str):
    """Test if a string represents a floating point number.

    Returns True when float(str) succeeds and False when it raises
    TypeError or ValueError (e.g. "", "NA" or None).  Other exceptions
    propagate instead of being silently swallowed.
    """
    try:
        float(str)
    except (TypeError, ValueError):
        return False
    return True


def openCsv(path):
    """Open the CSV file at path for reading by the csv module.

    Python 2 keeps the original universal-newline mode.  Python 3 no
    longer accepts 'rU', and its csv module expects newline=''.  The
    file is decoded as latin-1 there so that no byte sequence can make
    reading fail; only the numeric and currency columns are used.
    """
    if sys.version_info[0] >= 3:
        return open(path, newline='', encoding='latin-1')
    return open(path, 'rU')


### Currency conversion

# Currency conversion rates as of 1/13/2009 as provided by oanda.com
rates = {
    "CAD": 0.83523,
    "CHF": 0.89588,
    "EUR": 1.34011,
    "GBP": 1.49963,
    "USD": 1,
    "CNY": 0.14648,
}


def usdRate(currency):
    """Return the multiplier that converts an amount in currency to USD.

    An empty or "NA" currency is treated as USD (rate 1).  Any other
    code missing from the rates table raises KeyError.
    """
    if currency == "" or currency == "NA":
        return 1
    return rates[currency]


### Get journal data

def collectFees(journals):
    """Extract (fee, influence) data points from journal records.

    journals is an iterable of dicts keyed by the CSV column names, as
    produced by csv.DictReader.  A journal is used only if it has a
    numeric publication charge (zero is allowed), does not charge a
    positive submission fee, and has a numeric article influence.

    Returns four parallel lists: charges and influences for every
    journal used, then chargesNZ and influencesNZ for the subset whose
    publication charge in USD is greater than zero.
    """
    charges = []
    influences = []
    chargesNZ = []
    influencesNZ = []

    for journal in journals:
        # Filter out journals that aren't in the analysis.
        if (not isFloat(journal['Publication Charge Per Article'])
                or (isFloat(journal['Submission Charge'])
                    and float(journal['Submission Charge']) > 0)
                or not isFloat(journal['Article Influence'])):
            continue

        # Compute the publication fee in USD
        rate = usdRate(journal['Currency'])
        charge = float(journal['Publication Charge Per Article']) * rate
        influence = float(journal['Article Influence'])

        # Add the data point
        charges.append(charge)
        influences.append(influence)
        if charge > 0:
            # separately for non-zero charges
            chargesNZ.append(charge)
            influencesNZ.append(influence)

    return charges, influences, chargesNZ, influencesNZ


### Report correlation coefficients

def reportCorrelations(charges, influences, chargesNZ, influencesNZ):
    """Print the journal counts and the fee/influence correlations.

    Each print call takes one parenthesized string, so the output is
    the same under the Python 2 print statement and the Python 3 print
    function.
    """
    print("Based on %d journals (%d non-zero fee)"
          % (len(charges), len(chargesNZ)))
    print("Correlation for all: %5.3f"
          % numpy.corrcoef([charges, influences])[0][1])
    print("Correlation for non-zero: %5.3f"
          % numpy.corrcoef([chargesNZ, influencesNZ])[0][1])


### Generate scatter plot

def plotFees(charges, influences, chargesNZ, influencesNZ,
             fitExtent=2850, outfile='vanitypress'):
    """Write a scatter plot of the data with two linear fit lines.

    The dotted green line is fitted to all journals and the dotted blue
    line to the non-zero-fee journals; both are drawn from a fee of 0
    to fitExtent.  The figure is saved to outfile; no extension is
    given, so matplotlib appends its default one.
    """
    # Select the Agg backend before pyplot is imported.
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    plt.scatter(charges, influences)

    # Add best-fit line for all and non-zeros
    slope, intercept = numpy.polyfit(charges, influences, 1)
    slopeNZ, interceptNZ = numpy.polyfit(chargesNZ, influencesNZ, 1)
    plt.plot([0, fitExtent],
             [intercept, slope * fitExtent + intercept], 'g:')
    plt.plot([0, fitExtent],
             [interceptNZ, slopeNZ * fitExtent + interceptNZ], 'b:')

    # Add axes
    plt.axis([-50, round(max(charges) / 100 + 1) * 100,
              -.25, round(max(influences)) + .25])

    # Label axes
    plt.xlabel('Publication Charge (US$)', family='serif')
    plt.ylabel('Article Influence', family='serif')

    # Write out the figure
    plt.savefig(outfile)


def main():
    """Read journal-data.csv, print the report and write the plot."""
    # The with block closes the data file once every row is consumed.
    with openCsv('journal-data.csv') as datafile:
        data = collectFees(csv.DictReader(datafile))
    reportCorrelations(*data)
    plotFees(*data)


if __name__ == '__main__':
    main()